author     Linus Torvalds <torvalds@linux-foundation.org>   2015-09-03 15:28:30 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2015-09-03 15:28:30 -0400
commit     e31fb9e00543e5d3c5b686747d3c862bc09b59f3 (patch)
tree       4300b111471a858b542d55d47d587fb8ef52513a
parent     824b005c86f91fe02eb2743a4526361f11786f70 (diff)
parent     9181f8bf5abf4b9d59b12e878895375b84fe32ba (diff)
Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs
Pull ext3 removal, quota & udf fixes from Jan Kara:
 "The biggest change in the pull is the removal of ext3 filesystem
  driver (~28k lines removed). Ext4 driver is a full featured
  replacement these days and both RH and SUSE use it for several years
  without issues. Also there are some workarounds in VM & block layer
  mainly for ext3 which we could eventually get rid of.

  Other larger change is addition of proper error handling for
  dquot_initialize(). The rest is small fixes and cleanups"

[ I wasn't convinced about the ext3 removal and worried about things
  falling through the cracks for legacy users, but ext4 maintainers
  piped up and were all unanimously in favor of removal, and
  maintaining all legacy ext3 support inside ext4.  - Linus ]

* 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs:
  udf: Don't modify filesystem for read-only mounts
  quota: remove an unneeded condition
  ext4: memory leak on error in ext4_symlink()
  mm/Kconfig: NEED_BOUNCE_POOL: clean-up condition
  ext4: Improve ext4 Kconfig test
  block: Remove forced page bouncing under IO
  fs: Remove ext3 filesystem driver
  doc: Update doc about journalling layer
  jfs: Handle error from dquot_initialize()
  reiserfs: Handle error from dquot_initialize()
  ocfs2: Handle error from dquot_initialize()
  ext4: Handle error from dquot_initialize()
  ext2: Handle error from dquot_initalize()
  quota: Propagate error from ->acquire_dquot()
-rw-r--r--  Documentation/DocBook/filesystems.tmpl | 178
-rw-r--r--  Documentation/filesystems/ext2.txt | 4
-rw-r--r--  Documentation/filesystems/ext3.txt | 209
-rw-r--r--  Documentation/filesystems/vfs.txt | 2
-rw-r--r--  MAINTAINERS | 18
-rw-r--r--  block/bounce.c | 31
-rw-r--r--  fs/Kconfig | 5
-rw-r--r--  fs/Makefile | 2
-rw-r--r--  fs/ext2/ialloc.c | 5
-rw-r--r--  fs/ext2/inode.c | 7
-rw-r--r--  fs/ext2/namei.c | 46
-rw-r--r--  fs/ext3/Kconfig | 89
-rw-r--r--  fs/ext3/Makefile | 12
-rw-r--r--  fs/ext3/acl.c | 281
-rw-r--r--  fs/ext3/acl.h | 72
-rw-r--r--  fs/ext3/balloc.c | 2158
-rw-r--r--  fs/ext3/bitmap.c | 20
-rw-r--r--  fs/ext3/dir.c | 537
-rw-r--r--  fs/ext3/ext3.h | 1332
-rw-r--r--  fs/ext3/ext3_jbd.c | 59
-rw-r--r--  fs/ext3/file.c | 79
-rw-r--r--  fs/ext3/fsync.c | 109
-rw-r--r--  fs/ext3/hash.c | 206
-rw-r--r--  fs/ext3/ialloc.c | 706
-rw-r--r--  fs/ext3/inode.c | 3574
-rw-r--r--  fs/ext3/ioctl.c | 327
-rw-r--r--  fs/ext3/namei.c | 2586
-rw-r--r--  fs/ext3/namei.h | 27
-rw-r--r--  fs/ext3/resize.c | 1117
-rw-r--r--  fs/ext3/super.c | 3165
-rw-r--r--  fs/ext3/symlink.c | 46
-rw-r--r--  fs/ext3/xattr.c | 1330
-rw-r--r--  fs/ext3/xattr.h | 136
-rw-r--r--  fs/ext3/xattr_security.c | 78
-rw-r--r--  fs/ext3/xattr_trusted.c | 54
-rw-r--r--  fs/ext3/xattr_user.c | 58
-rw-r--r--  fs/ext4/Kconfig | 54
-rw-r--r--  fs/ext4/ialloc.c | 6
-rw-r--r--  fs/ext4/inode.c | 7
-rw-r--r--  fs/ext4/namei.c | 63
-rw-r--r--  fs/ext4/super.c | 14
-rw-r--r--  fs/jbd/Kconfig | 30
-rw-r--r--  fs/jbd/Makefile | 7
-rw-r--r--  fs/jbd/checkpoint.c | 782
-rw-r--r--  fs/jbd/commit.c | 1021
-rw-r--r--  fs/jbd/journal.c | 2145
-rw-r--r--  fs/jbd/recovery.c | 594
-rw-r--r--  fs/jbd/revoke.c | 733
-rw-r--r--  fs/jbd/transaction.c | 2237
-rw-r--r--  fs/jfs/file.c | 7
-rw-r--r--  fs/jfs/jfs_inode.c | 4
-rw-r--r--  fs/jfs/namei.c | 54
-rw-r--r--  fs/ocfs2/file.c | 22
-rw-r--r--  fs/ocfs2/namei.c | 59
-rw-r--r--  fs/ocfs2/quota_local.c | 4
-rw-r--r--  fs/ocfs2/refcounttree.c | 5
-rw-r--r--  fs/quota/dquot.c | 88
-rw-r--r--  fs/quota/quota.c | 4
-rw-r--r--  fs/reiserfs/inode.c | 7
-rw-r--r--  fs/reiserfs/namei.c | 63
-rw-r--r--  fs/udf/super.c | 7
-rw-r--r--  include/linux/blk_types.h | 5
-rw-r--r--  include/linux/jbd.h | 1047
-rw-r--r--  include/linux/jbd2.h | 41
-rw-r--r--  include/linux/jbd_common.h | 46
-rw-r--r--  include/linux/quotaops.h | 5
-rw-r--r--  include/trace/events/ext3.h | 866
-rw-r--r--  include/trace/events/jbd.h | 194
-rw-r--r--  mm/Kconfig | 8
69 files changed, 505 insertions(+), 28389 deletions(-)
diff --git a/Documentation/DocBook/filesystems.tmpl b/Documentation/DocBook/filesystems.tmpl
index bcdfdb9a9277..6006b6358c86 100644
--- a/Documentation/DocBook/filesystems.tmpl
+++ b/Documentation/DocBook/filesystems.tmpl
@@ -146,36 +146,30 @@
146The journalling layer is easy to use. You need to 146The journalling layer is easy to use. You need to
147first of all create a journal_t data structure. There are 147first of all create a journal_t data structure. There are
148two calls to do this dependent on how you decide to allocate the physical 148two calls to do this dependent on how you decide to allocate the physical
149media on which the journal resides. The journal_init_inode() call 149media on which the journal resides. The jbd2_journal_init_inode() call
150is for journals stored in filesystem inodes, or the journal_init_dev() 150is for journals stored in filesystem inodes, or the jbd2_journal_init_dev()
151call can be use for journal stored on a raw device (in a continuous range 151call can be used for journal stored on a raw device (in a continuous range
152of blocks). A journal_t is a typedef for a struct pointer, so when 152of blocks). A journal_t is a typedef for a struct pointer, so when
153you are finally finished make sure you call journal_destroy() on it 153you are finally finished make sure you call jbd2_journal_destroy() on it
154to free up any used kernel memory. 154to free up any used kernel memory.
155</para> 155</para>
156 156
157<para> 157<para>
158Once you have got your journal_t object you need to 'mount' or load the journal 158Once you have got your journal_t object you need to 'mount' or load the journal
159file, unless of course you haven't initialised it yet - in which case you 159file. The journalling layer expects the space for the journal was already
160need to call journal_create(). 160allocated and initialized properly by the userspace tools. When loading the
161journal you must call jbd2_journal_load() to process journal contents. If the
162client file system detects the journal contents does not need to be processed
163(or even need not have valid contents), it may call jbd2_journal_wipe() to
164clear the journal contents before calling jbd2_journal_load().
161</para> 165</para>
162 166
163<para> 167<para>
164Most of the time however your journal file will already have been created, but 168Note that jbd2_journal_wipe(..,0) calls jbd2_journal_skip_recovery() for you if
165before you load it you must call journal_wipe() to empty the journal file. 169it detects any outstanding transactions in the journal and similarly
166Hang on, you say , what if the filesystem wasn't cleanly umount()'d . Well, it is the 170jbd2_journal_load() will call jbd2_journal_recover() if necessary. I would
167job of the client file system to detect this and skip the call to journal_wipe(). 171advise reading ext4_load_journal() in fs/ext4/super.c for examples on this
168</para> 172stage.
169
170<para>
171In either case the next call should be to journal_load() which prepares the
172journal file for use. Note that journal_wipe(..,0) calls journal_skip_recovery()
173for you if it detects any outstanding transactions in the journal and similarly
174journal_load() will call journal_recover() if necessary.
175I would advise reading fs/ext3/super.c for examples on this stage.
176[RGG: Why is the journal_wipe() call necessary - doesn't this needlessly
177complicate the API. Or isn't a good idea for the journal layer to hide
178dirty mounts from the client fs]
179</para> 173</para>
180 174
181<para> 175<para>
@@ -189,41 +183,41 @@ You still need to actually journal your filesystem changes, this
189is done by wrapping them into transactions. Additionally you 183is done by wrapping them into transactions. Additionally you
190also need to wrap the modification of each of the buffers 184also need to wrap the modification of each of the buffers
191with calls to the journal layer, so it knows what the modifications 185with calls to the journal layer, so it knows what the modifications
192you are actually making are. To do this use journal_start() which 186you are actually making are. To do this use jbd2_journal_start() which
193returns a transaction handle. 187returns a transaction handle.
194</para> 188</para>
195 189
196<para> 190<para>
197journal_start() 191jbd2_journal_start()
198and its counterpart journal_stop(), which indicates the end of a transaction 192and its counterpart jbd2_journal_stop(), which indicates the end of a
199are nestable calls, so you can reenter a transaction if necessary, 193transaction are nestable calls, so you can reenter a transaction if necessary,
200but remember you must call journal_stop() the same number of times as 194but remember you must call jbd2_journal_stop() the same number of times as
201journal_start() before the transaction is completed (or more accurately 195jbd2_journal_start() before the transaction is completed (or more accurately
202leaves the update phase). Ext3/VFS makes use of this feature to simplify 196leaves the update phase). Ext4/VFS makes use of this feature to simplify
203quota support. 197handling of inode dirtying, quota support, etc.
204</para> 198</para>
205 199
206<para> 200<para>
207Inside each transaction you need to wrap the modifications to the 201Inside each transaction you need to wrap the modifications to the
208individual buffers (blocks). Before you start to modify a buffer you 202individual buffers (blocks). Before you start to modify a buffer you
209need to call journal_get_{create,write,undo}_access() as appropriate, 203need to call jbd2_journal_get_{create,write,undo}_access() as appropriate,
210this allows the journalling layer to copy the unmodified data if it 204this allows the journalling layer to copy the unmodified data if it
211needs to. After all the buffer may be part of a previously uncommitted 205needs to. After all the buffer may be part of a previously uncommitted
212transaction. 206transaction.
213At this point you are at last ready to modify a buffer, and once 207At this point you are at last ready to modify a buffer, and once
214you are have done so you need to call journal_dirty_{meta,}data(). 208you are have done so you need to call jbd2_journal_dirty_{meta,}data().
215Or if you've asked for access to a buffer you now know is now longer 209Or if you've asked for access to a buffer you now know is now longer
216required to be pushed back on the device you can call journal_forget() 210required to be pushed back on the device you can call jbd2_journal_forget()
217in much the same way as you might have used bforget() in the past. 211in much the same way as you might have used bforget() in the past.
218</para> 212</para>
219 213
220<para> 214<para>
221A journal_flush() may be called at any time to commit and checkpoint 215A jbd2_journal_flush() may be called at any time to commit and checkpoint
222all your transactions. 216all your transactions.
223</para> 217</para>
224 218
225<para> 219<para>
226Then at umount time , in your put_super() you can then call journal_destroy() 220Then at umount time , in your put_super() you can then call jbd2_journal_destroy()
227to clean up your in-core journal object. 221to clean up your in-core journal object.
228</para> 222</para>
229 223
@@ -231,53 +225,68 @@ to clean up your in-core journal object.
231Unfortunately there a couple of ways the journal layer can cause a deadlock. 225Unfortunately there a couple of ways the journal layer can cause a deadlock.
232The first thing to note is that each task can only have 226The first thing to note is that each task can only have
233a single outstanding transaction at any one time, remember nothing 227a single outstanding transaction at any one time, remember nothing
234commits until the outermost journal_stop(). This means 228commits until the outermost jbd2_journal_stop(). This means
235you must complete the transaction at the end of each file/inode/address 229you must complete the transaction at the end of each file/inode/address
236etc. operation you perform, so that the journalling system isn't re-entered 230etc. operation you perform, so that the journalling system isn't re-entered
237on another journal. Since transactions can't be nested/batched 231on another journal. Since transactions can't be nested/batched
238across differing journals, and another filesystem other than 232across differing journals, and another filesystem other than
239yours (say ext3) may be modified in a later syscall. 233yours (say ext4) may be modified in a later syscall.
240</para> 234</para>
241 235
242<para> 236<para>
243The second case to bear in mind is that journal_start() can 237The second case to bear in mind is that jbd2_journal_start() can
244block if there isn't enough space in the journal for your transaction 238block if there isn't enough space in the journal for your transaction
245(based on the passed nblocks param) - when it blocks it merely(!) needs to 239(based on the passed nblocks param) - when it blocks it merely(!) needs to
246wait for transactions to complete and be committed from other tasks, 240wait for transactions to complete and be committed from other tasks,
247so essentially we are waiting for journal_stop(). So to avoid 241so essentially we are waiting for jbd2_journal_stop(). So to avoid
248deadlocks you must treat journal_start/stop() as if they 242deadlocks you must treat jbd2_journal_start/stop() as if they
249were semaphores and include them in your semaphore ordering rules to prevent 243were semaphores and include them in your semaphore ordering rules to prevent
250deadlocks. Note that journal_extend() has similar blocking behaviour to 244deadlocks. Note that jbd2_journal_extend() has similar blocking behaviour to
251journal_start() so you can deadlock here just as easily as on journal_start(). 245jbd2_journal_start() so you can deadlock here just as easily as on
246jbd2_journal_start().
252</para> 247</para>
253 248
254<para> 249<para>
255Try to reserve the right number of blocks the first time. ;-). This will 250Try to reserve the right number of blocks the first time. ;-). This will
256be the maximum number of blocks you are going to touch in this transaction. 251be the maximum number of blocks you are going to touch in this transaction.
257I advise having a look at at least ext3_jbd.h to see the basis on which 252I advise having a look at at least ext4_jbd.h to see the basis on which
258ext3 uses to make these decisions. 253ext4 uses to make these decisions.
259</para> 254</para>
260 255
261<para> 256<para>
262Another wriggle to watch out for is your on-disk block allocation strategy. 257Another wriggle to watch out for is your on-disk block allocation strategy.
263why? Because, if you undo a delete, you need to ensure you haven't reused any 258Why? Because, if you do a delete, you need to ensure you haven't reused any
264of the freed blocks in a later transaction. One simple way of doing this 259of the freed blocks until the transaction freeing these blocks commits. If you
265is make sure any blocks you allocate only have checkpointed transactions 260reused these blocks and crash happens, there is no way to restore the contents
266listed against them. Ext3 does this in ext3_test_allocatable(). 261of the reallocated blocks at the end of the last fully committed transaction.
262
263One simple way of doing this is to mark blocks as free in internal in-memory
264block allocation structures only after the transaction freeing them commits.
265Ext4 uses journal commit callback for this purpose.
266</para>
267
268<para>
269With journal commit callbacks you can ask the journalling layer to call a
270callback function when the transaction is finally committed to disk, so that
271you can do some of your own management. You ask the journalling layer for
272calling the callback by simply setting journal->j_commit_callback function
273pointer and that function is called after each transaction commit. You can also
274use transaction->t_private_list for attaching entries to a transaction that
275need processing when the transaction commits.
267</para> 276</para>
268 277
269<para> 278<para>
270Lock is also providing through journal_{un,}lock_updates(), 279JBD2 also provides a way to block all transaction updates via
271ext3 uses this when it wants a window with a clean and stable fs for a moment. 280jbd2_journal_{un,}lock_updates(). Ext4 uses this when it wants a window with a
272eg. 281clean and stable fs for a moment. E.g.
273</para> 282</para>
274 283
275<programlisting> 284<programlisting>
276 285
277 journal_lock_updates() //stop new stuff happening.. 286 jbd2_journal_lock_updates() //stop new stuff happening..
278 journal_flush() // checkpoint everything. 287 jbd2_journal_flush() // checkpoint everything.
279 ..do stuff on stable fs 288 ..do stuff on stable fs
280 journal_unlock_updates() // carry on with filesystem use. 289 jbd2_journal_unlock_updates() // carry on with filesystem use.
281</programlisting> 290</programlisting>
282 291
283<para> 292<para>
@@ -286,29 +295,6 @@ if you allow unprivileged userspace to trigger codepaths containing these
286calls. 295calls.
287</para> 296</para>
288 297
289<para>
290A new feature of jbd since 2.5.25 is commit callbacks with the new
291journal_callback_set() function you can now ask the journalling layer
292to call you back when the transaction is finally committed to disk, so that
293you can do some of your own management. The key to this is the journal_callback
294struct, this maintains the internal callback information but you can
295extend it like this:-
296</para>
297<programlisting>
298 struct myfs_callback_s {
299 //Data structure element required by jbd..
300 struct journal_callback for_jbd;
301 // Stuff for myfs allocated together.
302 myfs_inode* i_commited;
303
304 }
305</programlisting>
306
307<para>
308this would be useful if you needed to know when data was committed to a
309particular inode.
310</para>
311
312 </sect2> 298 </sect2>
313 299
314 <sect2 id="jbd_summary"> 300 <sect2 id="jbd_summary">
@@ -319,36 +305,6 @@ being each mount, each modification (transaction) and each changed buffer
319to tell the journalling layer about them. 305to tell the journalling layer about them.
320</para> 306</para>
321 307
322<para>
323Here is a some pseudo code to give you an idea of how it works, as
324an example.
325</para>
326
327<programlisting>
328 journal_t* my_jnrl = journal_create();
329 journal_init_{dev,inode}(jnrl,...)
330 if (clean) journal_wipe();
331 journal_load();
332
333 foreach(transaction) { /*transactions must be
334 completed before
335 a syscall returns to
336 userspace*/
337
338 handle_t * xct=journal_start(my_jnrl);
339 foreach(bh) {
340 journal_get_{create,write,undo}_access(xact,bh);
341 if ( myfs_modify(bh) ) { /* returns true
342 if makes changes */
343 journal_dirty_{meta,}data(xact,bh);
344 } else {
345 journal_forget(bh);
346 }
347 }
348 journal_stop(xct);
349 }
350 journal_destroy(my_jrnl);
351</programlisting>
352 </sect2> 308 </sect2>
353 309
354 </sect1> 310 </sect1>
@@ -357,13 +313,13 @@ an example.
357 <title>Data Types</title> 313 <title>Data Types</title>
358 <para> 314 <para>
359 The journalling layer uses typedefs to 'hide' the concrete definitions 315 The journalling layer uses typedefs to 'hide' the concrete definitions
360 of the structures used. As a client of the JBD layer you can 316 of the structures used. As a client of the JBD2 layer you can
361 just rely on the using the pointer as a magic cookie of some sort. 317 just rely on the using the pointer as a magic cookie of some sort.
362 318
363 Obviously the hiding is not enforced as this is 'C'. 319 Obviously the hiding is not enforced as this is 'C'.
364 </para> 320 </para>
365 <sect2 id="structures"><title>Structures</title> 321 <sect2 id="structures"><title>Structures</title>
366!Iinclude/linux/jbd.h 322!Iinclude/linux/jbd2.h
367 </sect2> 323 </sect2>
368 </sect1> 324 </sect1>
369 325
@@ -375,11 +331,11 @@ an example.
375 manage transactions 331 manage transactions
376 </para> 332 </para>
377 <sect2 id="journal_level"><title>Journal Level</title> 333 <sect2 id="journal_level"><title>Journal Level</title>
378!Efs/jbd/journal.c 334!Efs/jbd2/journal.c
379!Ifs/jbd/recovery.c 335!Ifs/jbd2/recovery.c
380 </sect2> 336 </sect2>
381 <sect2 id="transaction_level"><title>Transasction Level</title> 337 <sect2 id="transaction_level"><title>Transasction Level</title>
382!Efs/jbd/transaction.c 338!Efs/jbd2/transaction.c
383 </sect2> 339 </sect2>
384 </sect1> 340 </sect1>
385 <sect1 id="see_also"> 341 <sect1 id="see_also">
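
The documentation hunks above describe the jbd2 API that replaces the old journal_* names: initialise the journal with jbd2_journal_init_inode() (or jbd2_journal_init_dev() for a journal on a raw device), optionally wipe it, load it, and then wrap every metadata update in jbd2_journal_start()/jbd2_journal_stop() with jbd2_journal_get_write_access() and jbd2_journal_dirty_metadata() around each buffer modification. A minimal sketch of that flow for a hypothetical "myfs" client follows; the myfs_* names, the single-credit reservation and the callback body are illustrative assumptions, not code from this patch or from any in-tree filesystem.

#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/buffer_head.h>
#include <linux/err.h>

/* Hypothetical per-transaction callback: process whatever the filesystem
 * queued on transaction->t_private_list once the transaction hits disk. */
static void myfs_commit_callback(journal_t *journal, transaction_t *txn)
{
	/* walk txn->t_private_list and complete/release the queued items */
}

/* Set up and load a journal kept in a filesystem inode. */
static journal_t *myfs_load_journal(struct inode *journal_inode,
				    bool needs_recovery)
{
	journal_t *journal;

	journal = jbd2_journal_init_inode(journal_inode);
	if (!journal)
		return NULL;

	/* Ask to be called back after each transaction commit. */
	journal->j_commit_callback = myfs_commit_callback;

	/* If the filesystem knows the journal contents need no processing,
	 * discard them; jbd2_journal_wipe(journal, 0) also skips recovery. */
	if (!needs_recovery)
		jbd2_journal_wipe(journal, 0);

	if (jbd2_journal_load(journal)) {	/* replays the journal if needed */
		jbd2_journal_destroy(journal);
		return NULL;
	}
	return journal;
}

/* Journal one metadata block update inside a transaction handle. */
static int myfs_update_block(journal_t *journal, struct buffer_head *bh)
{
	handle_t *handle;
	int err;

	/* Reserve credits up front; jbd2_journal_start() may block until
	 * other tasks commit transactions and free journal space. */
	handle = jbd2_journal_start(journal, 1);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	err = jbd2_journal_get_write_access(handle, bh);
	if (!err) {
		/* ... modify bh->b_data here ... */
		err = jbd2_journal_dirty_metadata(handle, bh);
	}
	jbd2_journal_stop(handle);
	return err;
}

For the real-world version of this sequence, including read-only handling and external journal devices, see ext4_load_journal() in fs/ext4/super.c, which the updated text points to.
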
diff --git a/Documentation/filesystems/ext2.txt b/Documentation/filesystems/ext2.txt
index b9714569e472..55755395d3dc 100644
--- a/Documentation/filesystems/ext2.txt
+++ b/Documentation/filesystems/ext2.txt
@@ -360,8 +360,8 @@ and are copied into the filesystem. If a transaction is incomplete at
360the time of the crash, then there is no guarantee of consistency for 360the time of the crash, then there is no guarantee of consistency for
361the blocks in that transaction so they are discarded (which means any 361the blocks in that transaction so they are discarded (which means any
362filesystem changes they represent are also lost). 362filesystem changes they represent are also lost).
363Check Documentation/filesystems/ext3.txt if you want to read more about 363Check Documentation/filesystems/ext4.txt if you want to read more about
364ext3 and journaling. 364ext4 and journaling.
365 365
366References 366References
367========== 367==========
diff --git a/Documentation/filesystems/ext3.txt b/Documentation/filesystems/ext3.txt
index 7ed0d17d6721..58758fbef9e0 100644
--- a/Documentation/filesystems/ext3.txt
+++ b/Documentation/filesystems/ext3.txt
@@ -6,210 +6,7 @@ Ext3 was originally released in September 1999. Written by Stephen Tweedie
6for the 2.2 branch, and ported to 2.4 kernels by Peter Braam, Andreas Dilger, 6for the 2.2 branch, and ported to 2.4 kernels by Peter Braam, Andreas Dilger,
7Andrew Morton, Alexander Viro, Ted Ts'o and Stephen Tweedie. 7Andrew Morton, Alexander Viro, Ted Ts'o and Stephen Tweedie.
8 8
9Ext3 is the ext2 filesystem enhanced with journalling capabilities. 9Ext3 is the ext2 filesystem enhanced with journalling capabilities. The
10filesystem is a subset of ext4 filesystem so use ext4 driver for accessing
11ext3 filesystems.
10 12
11Options
12=======
13
14When mounting an ext3 filesystem, the following option are accepted:
15(*) == default
16
17ro Mount filesystem read only. Note that ext3 will replay
18 the journal (and thus write to the partition) even when
19 mounted "read only". Mount options "ro,noload" can be
20 used to prevent writes to the filesystem.
21
22journal=update Update the ext3 file system's journal to the current
23 format.
24
25journal=inum When a journal already exists, this option is ignored.
26 Otherwise, it specifies the number of the inode which
27 will represent the ext3 file system's journal file.
28
29journal_path=path
30journal_dev=devnum When the external journal device's major/minor numbers
31 have changed, these options allow the user to specify
32 the new journal location. The journal device is
33 identified through either its new major/minor numbers
34 encoded in devnum, or via a path to the device.
35
36norecovery Don't load the journal on mounting. Note that this forces
37noload mount of inconsistent filesystem, which can lead to
38 various problems.
39
40data=journal All data are committed into the journal prior to being
41 written into the main file system.
42
43data=ordered (*) All data are forced directly out to the main file
44 system prior to its metadata being committed to the
45 journal.
46
47data=writeback Data ordering is not preserved, data may be written
48 into the main file system after its metadata has been
49 committed to the journal.
50
51commit=nrsec (*) Ext3 can be told to sync all its data and metadata
52 every 'nrsec' seconds. The default value is 5 seconds.
53 This means that if you lose your power, you will lose
54 as much as the latest 5 seconds of work (your
55 filesystem will not be damaged though, thanks to the
56 journaling). This default value (or any low value)
57 will hurt performance, but it's good for data-safety.
58 Setting it to 0 will have the same effect as leaving
59 it at the default (5 seconds).
60 Setting it to very large values will improve
61 performance.
62
63barrier=<0|1(*)> This enables/disables the use of write barriers in
64barrier (*) the jbd code. barrier=0 disables, barrier=1 enables.
65nobarrier This also requires an IO stack which can support
66 barriers, and if jbd gets an error on a barrier
67 write, it will disable again with a warning.
68 Write barriers enforce proper on-disk ordering
69 of journal commits, making volatile disk write caches
70 safe to use, at some performance penalty. If
71 your disks are battery-backed in one way or another,
72 disabling barriers may safely improve performance.
73 The mount options "barrier" and "nobarrier" can
74 also be used to enable or disable barriers, for
75 consistency with other ext3 mount options.
76
77user_xattr Enables Extended User Attributes. Additionally, you
78 need to have extended attribute support enabled in the
79 kernel configuration (CONFIG_EXT3_FS_XATTR). See the
80 attr(5) manual page and http://acl.bestbits.at/ to
81 learn more about extended attributes.
82
83nouser_xattr Disables Extended User Attributes.
84
85acl Enables POSIX Access Control Lists support.
86 Additionally, you need to have ACL support enabled in
87 the kernel configuration (CONFIG_EXT3_FS_POSIX_ACL).
88 See the acl(5) manual page and http://acl.bestbits.at/
89 for more information.
90
91noacl This option disables POSIX Access Control List
92 support.
93
94reservation
95
96noreservation
97
98bsddf (*) Make 'df' act like BSD.
99minixdf Make 'df' act like Minix.
100
101check=none Don't do extra checking of bitmaps on mount.
102nocheck
103
104debug Extra debugging information is sent to syslog.
105
106errors=remount-ro Remount the filesystem read-only on an error.
107errors=continue Keep going on a filesystem error.
108errors=panic Panic and halt the machine if an error occurs.
109 (These mount options override the errors behavior
110 specified in the superblock, which can be
111 configured using tune2fs.)
112
113data_err=ignore(*) Just print an error message if an error occurs
114 in a file data buffer in ordered mode.
115data_err=abort Abort the journal if an error occurs in a file
116 data buffer in ordered mode.
117
118grpid Give objects the same group ID as their creator.
119bsdgroups
120
121nogrpid (*) New objects have the group ID of their creator.
122sysvgroups
123
124resgid=n The group ID which may use the reserved blocks.
125
126resuid=n The user ID which may use the reserved blocks.
127
128sb=n Use alternate superblock at this location.
129
130quota These options are ignored by the filesystem. They
131noquota are used only by quota tools to recognize volumes
132grpquota where quota should be turned on. See documentation
133usrquota in the quota-tools package for more details
134 (http://sourceforge.net/projects/linuxquota).
135
136jqfmt=<quota type> These options tell filesystem details about quota
137usrjquota=<file> so that quota information can be properly updated
138grpjquota=<file> during journal replay. They replace the above
139 quota options. See documentation in the quota-tools
140 package for more details
141 (http://sourceforge.net/projects/linuxquota).
142
143Specification
144=============
145Ext3 shares all disk implementation with the ext2 filesystem, and adds
146transactions capabilities to ext2. Journaling is done by the Journaling Block
147Device layer.
148
149Journaling Block Device layer
150-----------------------------
151The Journaling Block Device layer (JBD) isn't ext3 specific. It was designed
152to add journaling capabilities to a block device. The ext3 filesystem code
153will inform the JBD of modifications it is performing (called a transaction).
154The journal supports the transactions start and stop, and in case of a crash,
155the journal can replay the transactions to quickly put the partition back into
156a consistent state.
157
158Handles represent a single atomic update to a filesystem. JBD can handle an
159external journal on a block device.
160
161Data Mode
162---------
163There are 3 different data modes:
164
165* writeback mode
166In data=writeback mode, ext3 does not journal data at all. This mode provides
167a similar level of journaling as that of XFS, JFS, and ReiserFS in its default
168mode - metadata journaling. A crash+recovery can cause incorrect data to
169appear in files which were written shortly before the crash. This mode will
170typically provide the best ext3 performance.
171
172* ordered mode
173In data=ordered mode, ext3 only officially journals metadata, but it logically
174groups metadata and data blocks into a single unit called a transaction. When
175it's time to write the new metadata out to disk, the associated data blocks
176are written first. In general, this mode performs slightly slower than
177writeback but significantly faster than journal mode.
178
179* journal mode
180data=journal mode provides full data and metadata journaling. All new data is
181written to the journal first, and then to its final location.
182In the event of a crash, the journal can be replayed, bringing both data and
183metadata into a consistent state. This mode is the slowest except when data
184needs to be read from and written to disk at the same time where it
185outperforms all other modes.
186
187Compatibility
188-------------
189
190Ext2 partitions can be easily convert to ext3, with `tune2fs -j <dev>`.
191Ext3 is fully compatible with Ext2. Ext3 partitions can easily be mounted as
192Ext2.
193
194
195External Tools
196==============
197See manual pages to learn more.
198
199tune2fs: create a ext3 journal on a ext2 partition with the -j flag.
200mke2fs: create a ext3 partition with the -j flag.
201debugfs: ext2 and ext3 file system debugger.
202ext2online: online (mounted) ext2 and ext3 filesystem resizer
203
204
205References
206==========
207
208kernel source: <file:fs/ext3/>
209 <file:fs/jbd/>
210
211programs: http://e2fsprogs.sourceforge.net/
212 http://ext2resize.sourceforge.net
213
214useful links: http://www.ibm.com/developerworks/library/l-fs7/index.html
215 http://www.ibm.com/developerworks/library/l-fs8/index.html
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 5eb8456fc41e..8c6f07ad373a 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -769,7 +769,7 @@ struct address_space_operations {
769 to stall to allow flushers a chance to complete some IO. Ordinarily 769 to stall to allow flushers a chance to complete some IO. Ordinarily
770 it can use PageDirty and PageWriteback but some filesystems have 770 it can use PageDirty and PageWriteback but some filesystems have
771 more complex state (unstable pages in NFS prevent reclaim) or 771 more complex state (unstable pages in NFS prevent reclaim) or
772 do not set those flags due to locking problems (jbd). This callback 772 do not set those flags due to locking problems. This callback
773 allows a filesystem to indicate to the VM if a page should be 773 allows a filesystem to indicate to the VM if a page should be
774 treated as dirty or writeback for the purposes of stalling. 774 treated as dirty or writeback for the purposes of stalling.
775 775
diff --git a/MAINTAINERS b/MAINTAINERS
index cb5e81811c81..73db93cc55fd 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4078,15 +4078,6 @@ F: Documentation/filesystems/ext2.txt
4078F: fs/ext2/ 4078F: fs/ext2/
4079F: include/linux/ext2* 4079F: include/linux/ext2*
4080 4080
4081EXT3 FILE SYSTEM
4082M: Jan Kara <jack@suse.com>
4083M: Andrew Morton <akpm@linux-foundation.org>
4084M: Andreas Dilger <adilger.kernel@dilger.ca>
4085L: linux-ext4@vger.kernel.org
4086S: Maintained
4087F: Documentation/filesystems/ext3.txt
4088F: fs/ext3/
4089
4090EXT4 FILE SYSTEM 4081EXT4 FILE SYSTEM
4091M: "Theodore Ts'o" <tytso@mit.edu> 4082M: "Theodore Ts'o" <tytso@mit.edu>
4092M: Andreas Dilger <adilger.kernel@dilger.ca> 4083M: Andreas Dilger <adilger.kernel@dilger.ca>
@@ -5787,16 +5778,9 @@ S: Maintained
5787F: fs/jffs2/ 5778F: fs/jffs2/
5788F: include/uapi/linux/jffs2.h 5779F: include/uapi/linux/jffs2.h
5789 5780
5790JOURNALLING LAYER FOR BLOCK DEVICES (JBD)
5791M: Andrew Morton <akpm@linux-foundation.org>
5792M: Jan Kara <jack@suse.com>
5793L: linux-ext4@vger.kernel.org
5794S: Maintained
5795F: fs/jbd/
5796F: include/linux/jbd.h
5797
5798JOURNALLING LAYER FOR BLOCK DEVICES (JBD2) 5781JOURNALLING LAYER FOR BLOCK DEVICES (JBD2)
5799M: "Theodore Ts'o" <tytso@mit.edu> 5782M: "Theodore Ts'o" <tytso@mit.edu>
5783M: Jan Kara <jack@suse.com>
5800L: linux-ext4@vger.kernel.org 5784L: linux-ext4@vger.kernel.org
5801S: Maintained 5785S: Maintained
5802F: fs/jbd2/ 5786F: fs/jbd2/
diff --git a/block/bounce.c b/block/bounce.c
index 2c310ea007ee..0611aea1cfe9 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -177,26 +177,8 @@ static void bounce_end_io_read_isa(struct bio *bio)
177 __bounce_end_io_read(bio, isa_page_pool); 177 __bounce_end_io_read(bio, isa_page_pool);
178} 178}
179 179
180#ifdef CONFIG_NEED_BOUNCE_POOL
181static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio)
182{
183 if (bio_data_dir(bio) != WRITE)
184 return 0;
185
186 if (!bdi_cap_stable_pages_required(&q->backing_dev_info))
187 return 0;
188
189 return bio_flagged(bio, BIO_SNAP_STABLE);
190}
191#else
192static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio)
193{
194 return 0;
195}
196#endif /* CONFIG_NEED_BOUNCE_POOL */
197
198static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, 180static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
199 mempool_t *pool, int force) 181 mempool_t *pool)
200{ 182{
201 struct bio *bio; 183 struct bio *bio;
202 int rw = bio_data_dir(*bio_orig); 184 int rw = bio_data_dir(*bio_orig);
@@ -204,8 +186,6 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
204 struct bvec_iter iter; 186 struct bvec_iter iter;
205 unsigned i; 187 unsigned i;
206 188
207 if (force)
208 goto bounce;
209 bio_for_each_segment(from, *bio_orig, iter) 189 bio_for_each_segment(from, *bio_orig, iter)
210 if (page_to_pfn(from.bv_page) > queue_bounce_pfn(q)) 190 if (page_to_pfn(from.bv_page) > queue_bounce_pfn(q))
211 goto bounce; 191 goto bounce;
@@ -217,7 +197,7 @@ bounce:
217 bio_for_each_segment_all(to, bio, i) { 197 bio_for_each_segment_all(to, bio, i) {
218 struct page *page = to->bv_page; 198 struct page *page = to->bv_page;
219 199
220 if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force) 200 if (page_to_pfn(page) <= queue_bounce_pfn(q))
221 continue; 201 continue;
222 202
223 to->bv_page = mempool_alloc(pool, q->bounce_gfp); 203 to->bv_page = mempool_alloc(pool, q->bounce_gfp);
@@ -255,7 +235,6 @@ bounce:
255 235
256void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) 236void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
257{ 237{
258 int must_bounce;
259 mempool_t *pool; 238 mempool_t *pool;
260 239
261 /* 240 /*
@@ -264,15 +243,13 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
264 if (!bio_has_data(*bio_orig)) 243 if (!bio_has_data(*bio_orig))
265 return; 244 return;
266 245
267 must_bounce = must_snapshot_stable_pages(q, *bio_orig);
268
269 /* 246 /*
270 * for non-isa bounce case, just check if the bounce pfn is equal 247 * for non-isa bounce case, just check if the bounce pfn is equal
271 * to or bigger than the highest pfn in the system -- in that case, 248 * to or bigger than the highest pfn in the system -- in that case,
272 * don't waste time iterating over bio segments 249 * don't waste time iterating over bio segments
273 */ 250 */
274 if (!(q->bounce_gfp & GFP_DMA)) { 251 if (!(q->bounce_gfp & GFP_DMA)) {
275 if (queue_bounce_pfn(q) >= blk_max_pfn && !must_bounce) 252 if (queue_bounce_pfn(q) >= blk_max_pfn)
276 return; 253 return;
277 pool = page_pool; 254 pool = page_pool;
278 } else { 255 } else {
@@ -283,7 +260,7 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
283 /* 260 /*
284 * slow path 261 * slow path
285 */ 262 */
286 __blk_queue_bounce(q, bio_orig, pool, must_bounce); 263 __blk_queue_bounce(q, bio_orig, pool);
287} 264}
288 265
289EXPORT_SYMBOL(blk_queue_bounce); 266EXPORT_SYMBOL(blk_queue_bounce);
diff --git a/fs/Kconfig b/fs/Kconfig
index 011f43365d7b..da3f32f1a4e4 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -11,18 +11,15 @@ config DCACHE_WORD_ACCESS
11if BLOCK 11if BLOCK
12 12
13source "fs/ext2/Kconfig" 13source "fs/ext2/Kconfig"
14source "fs/ext3/Kconfig"
15source "fs/ext4/Kconfig" 14source "fs/ext4/Kconfig"
16source "fs/jbd/Kconfig"
17source "fs/jbd2/Kconfig" 15source "fs/jbd2/Kconfig"
18 16
19config FS_MBCACHE 17config FS_MBCACHE
20# Meta block cache for Extended Attributes (ext2/ext3/ext4) 18# Meta block cache for Extended Attributes (ext2/ext3/ext4)
21 tristate 19 tristate
22 default y if EXT2_FS=y && EXT2_FS_XATTR 20 default y if EXT2_FS=y && EXT2_FS_XATTR
23 default y if EXT3_FS=y && EXT3_FS_XATTR
24 default y if EXT4_FS=y 21 default y if EXT4_FS=y
25 default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS 22 default m if EXT2_FS_XATTR || EXT4_FS
26 23
27source "fs/reiserfs/Kconfig" 24source "fs/reiserfs/Kconfig"
28source "fs/jfs/Kconfig" 25source "fs/jfs/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index cb20e4bf2303..09e051fefc5b 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -62,12 +62,10 @@ obj-$(CONFIG_DLM) += dlm/
62# Do not add any filesystems before this line 62# Do not add any filesystems before this line
63obj-$(CONFIG_FSCACHE) += fscache/ 63obj-$(CONFIG_FSCACHE) += fscache/
64obj-$(CONFIG_REISERFS_FS) += reiserfs/ 64obj-$(CONFIG_REISERFS_FS) += reiserfs/
65obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3
66obj-$(CONFIG_EXT2_FS) += ext2/ 65obj-$(CONFIG_EXT2_FS) += ext2/
67# We place ext4 after ext2 so plain ext2 root fs's are mounted using ext2 66# We place ext4 after ext2 so plain ext2 root fs's are mounted using ext2
68# unless explicitly requested by rootfstype 67# unless explicitly requested by rootfstype
69obj-$(CONFIG_EXT4_FS) += ext4/ 68obj-$(CONFIG_EXT4_FS) += ext4/
70obj-$(CONFIG_JBD) += jbd/
71obj-$(CONFIG_JBD2) += jbd2/ 69obj-$(CONFIG_JBD2) += jbd2/
72obj-$(CONFIG_CRAMFS) += cramfs/ 70obj-$(CONFIG_CRAMFS) += cramfs/
73obj-$(CONFIG_SQUASHFS) += squashfs/ 71obj-$(CONFIG_SQUASHFS) += squashfs/
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 5c04a0ddea80..efe5fb21c533 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -577,7 +577,10 @@ got:
577 goto fail; 577 goto fail;
578 } 578 }
579 579
580 dquot_initialize(inode); 580 err = dquot_initialize(inode);
581 if (err)
582 goto fail_drop;
583
581 err = dquot_alloc_inode(inode); 584 err = dquot_alloc_inode(inode);
582 if (err) 585 if (err)
583 goto fail_drop; 586 goto fail_drop;
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 5c09776d347f..a3a404c5df2e 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1552,8 +1552,11 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
1552 if (error) 1552 if (error)
1553 return error; 1553 return error;
1554 1554
1555 if (is_quota_modification(inode, iattr)) 1555 if (is_quota_modification(inode, iattr)) {
1556 dquot_initialize(inode); 1556 error = dquot_initialize(inode);
1557 if (error)
1558 return error;
1559 }
1557 if ((iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) || 1560 if ((iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) ||
1558 (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid))) { 1561 (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid))) {
1559 error = dquot_transfer(inode, iattr); 1562 error = dquot_transfer(inode, iattr);
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 13ec54a99c96..b4841e3066a5 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -96,8 +96,11 @@ struct dentry *ext2_get_parent(struct dentry *child)
96static int ext2_create (struct inode * dir, struct dentry * dentry, umode_t mode, bool excl) 96static int ext2_create (struct inode * dir, struct dentry * dentry, umode_t mode, bool excl)
97{ 97{
98 struct inode *inode; 98 struct inode *inode;
99 int err;
99 100
100 dquot_initialize(dir); 101 err = dquot_initialize(dir);
102 if (err)
103 return err;
101 104
102 inode = ext2_new_inode(dir, mode, &dentry->d_name); 105 inode = ext2_new_inode(dir, mode, &dentry->d_name);
103 if (IS_ERR(inode)) 106 if (IS_ERR(inode))
@@ -143,7 +146,9 @@ static int ext2_mknod (struct inode * dir, struct dentry *dentry, umode_t mode,
143 if (!new_valid_dev(rdev)) 146 if (!new_valid_dev(rdev))
144 return -EINVAL; 147 return -EINVAL;
145 148
146 dquot_initialize(dir); 149 err = dquot_initialize(dir);
150 if (err)
151 return err;
147 152
148 inode = ext2_new_inode (dir, mode, &dentry->d_name); 153 inode = ext2_new_inode (dir, mode, &dentry->d_name);
149 err = PTR_ERR(inode); 154 err = PTR_ERR(inode);
@@ -169,7 +174,9 @@ static int ext2_symlink (struct inode * dir, struct dentry * dentry,
169 if (l > sb->s_blocksize) 174 if (l > sb->s_blocksize)
170 goto out; 175 goto out;
171 176
172 dquot_initialize(dir); 177 err = dquot_initialize(dir);
178 if (err)
179 goto out;
173 180
174 inode = ext2_new_inode (dir, S_IFLNK | S_IRWXUGO, &dentry->d_name); 181 inode = ext2_new_inode (dir, S_IFLNK | S_IRWXUGO, &dentry->d_name);
175 err = PTR_ERR(inode); 182 err = PTR_ERR(inode);
@@ -212,7 +219,9 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir,
212 struct inode *inode = d_inode(old_dentry); 219 struct inode *inode = d_inode(old_dentry);
213 int err; 220 int err;
214 221
215 dquot_initialize(dir); 222 err = dquot_initialize(dir);
223 if (err)
224 return err;
216 225
217 inode->i_ctime = CURRENT_TIME_SEC; 226 inode->i_ctime = CURRENT_TIME_SEC;
218 inode_inc_link_count(inode); 227 inode_inc_link_count(inode);
@@ -233,7 +242,9 @@ static int ext2_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
233 struct inode * inode; 242 struct inode * inode;
234 int err; 243 int err;
235 244
236 dquot_initialize(dir); 245 err = dquot_initialize(dir);
246 if (err)
247 return err;
237 248
238 inode_inc_link_count(dir); 249 inode_inc_link_count(dir);
239 250
@@ -279,13 +290,17 @@ static int ext2_unlink(struct inode * dir, struct dentry *dentry)
279 struct inode * inode = d_inode(dentry); 290 struct inode * inode = d_inode(dentry);
280 struct ext2_dir_entry_2 * de; 291 struct ext2_dir_entry_2 * de;
281 struct page * page; 292 struct page * page;
282 int err = -ENOENT; 293 int err;
283 294
284 dquot_initialize(dir); 295 err = dquot_initialize(dir);
296 if (err)
297 goto out;
285 298
286 de = ext2_find_entry (dir, &dentry->d_name, &page); 299 de = ext2_find_entry (dir, &dentry->d_name, &page);
287 if (!de) 300 if (!de) {
301 err = -ENOENT;
288 goto out; 302 goto out;
303 }
289 304
290 err = ext2_delete_entry (de, page); 305 err = ext2_delete_entry (de, page);
291 if (err) 306 if (err)
@@ -323,14 +338,21 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
323 struct ext2_dir_entry_2 * dir_de = NULL; 338 struct ext2_dir_entry_2 * dir_de = NULL;
324 struct page * old_page; 339 struct page * old_page;
325 struct ext2_dir_entry_2 * old_de; 340 struct ext2_dir_entry_2 * old_de;
326 int err = -ENOENT; 341 int err;
342
343 err = dquot_initialize(old_dir);
344 if (err)
345 goto out;
327 346
328 dquot_initialize(old_dir); 347 err = dquot_initialize(new_dir);
329 dquot_initialize(new_dir); 348 if (err)
349 goto out;
330 350
331 old_de = ext2_find_entry (old_dir, &old_dentry->d_name, &old_page); 351 old_de = ext2_find_entry (old_dir, &old_dentry->d_name, &old_page);
332 if (!old_de) 352 if (!old_de) {
353 err = -ENOENT;
333 goto out; 354 goto out;
355 }
334 356
335 if (S_ISDIR(old_inode->i_mode)) { 357 if (S_ISDIR(old_inode->i_mode)) {
336 err = -EIO; 358 err = -EIO;
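
The ext2 hunks above (ialloc.c, inode.c, namei.c) all apply the same conversion: dquot_initialize() now returns an error code that callers must check and propagate instead of discarding, matching the "Handle error from dquot_initialize()" patches listed in the merge. Below is a minimal sketch of the pattern in a hypothetical directory operation; myfs_create() and myfs_new_inode() are made-up names used only to show the shape of the change, not functions from this series.

#include <linux/fs.h>
#include <linux/quotaops.h>
#include <linux/err.h>

/* Hypothetical inode allocator, standing in for ext2_new_inode() etc. */
struct inode *myfs_new_inode(struct inode *dir, umode_t mode,
			     const struct qstr *name);

static int myfs_create(struct inode *dir, struct dentry *dentry,
		       umode_t mode, bool excl)
{
	struct inode *inode;
	int err;

	/* Previously: dquot_initialize(dir); with the result ignored.
	 * After this series the quota initialisation error is returned
	 * to the VFS instead of being silently dropped. */
	err = dquot_initialize(dir);
	if (err)
		return err;

	inode = myfs_new_inode(dir, mode, &dentry->d_name);
	if (IS_ERR(inode))
		return PTR_ERR(inode);

	/* ... set up inode operations, add the directory entry ... */
	return 0;
}
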
diff --git a/fs/ext3/Kconfig b/fs/ext3/Kconfig
deleted file mode 100644
index e8c6ba0e4a3e..000000000000
--- a/fs/ext3/Kconfig
+++ /dev/null
@@ -1,89 +0,0 @@
1config EXT3_FS
2 tristate "Ext3 journalling file system support"
3 select JBD
4 help
5 This is the journalling version of the Second extended file system
6 (often called ext3), the de facto standard Linux file system
7 (method to organize files on a storage device) for hard disks.
8
9 The journalling code included in this driver means you do not have
10 to run e2fsck (file system checker) on your file systems after a
11 crash. The journal keeps track of any changes that were being made
12 at the time the system crashed, and can ensure that your file system
13 is consistent without the need for a lengthy check.
14
15 Other than adding the journal to the file system, the on-disk format
16 of ext3 is identical to ext2. It is possible to freely switch
17 between using the ext3 driver and the ext2 driver, as long as the
18 file system has been cleanly unmounted, or e2fsck is run on the file
19 system.
20
21 To add a journal on an existing ext2 file system or change the
22 behavior of ext3 file systems, you can use the tune2fs utility ("man
23 tune2fs"). To modify attributes of files and directories on ext3
24 file systems, use chattr ("man chattr"). You need to be using
25 e2fsprogs version 1.20 or later in order to create ext3 journals
26 (available at <http://sourceforge.net/projects/e2fsprogs/>).
27
28 To compile this file system support as a module, choose M here: the
29 module will be called ext3.
30
31config EXT3_DEFAULTS_TO_ORDERED
32 bool "Default to 'data=ordered' in ext3"
33 depends on EXT3_FS
34 default y
35 help
36 The journal mode options for ext3 have different tradeoffs
37 between when data is guaranteed to be on disk and
38 performance. The use of "data=writeback" can cause
39 unwritten data to appear in files after an system crash or
40 power failure, which can be a security issue. However,
41 "data=ordered" mode can also result in major performance
42 problems, including seconds-long delays before an fsync()
43 call returns. For details, see:
44
45 http://ext4.wiki.kernel.org/index.php/Ext3_data_mode_tradeoffs
46
47 If you have been historically happy with ext3's performance,
48 data=ordered mode will be a safe choice and you should
49 answer 'y' here. If you understand the reliability and data
50 privacy issues of data=writeback and are willing to make
51 that trade off, answer 'n'.
52
53config EXT3_FS_XATTR
54 bool "Ext3 extended attributes"
55 depends on EXT3_FS
56 default y
57 help
58 Extended attributes are name:value pairs associated with inodes by
59 the kernel or by users (see the attr(5) manual page, or visit
60 <http://acl.bestbits.at/> for details).
61
62 If unsure, say N.
63
64 You need this for POSIX ACL support on ext3.
65
66config EXT3_FS_POSIX_ACL
67 bool "Ext3 POSIX Access Control Lists"
68 depends on EXT3_FS_XATTR
69 select FS_POSIX_ACL
70 help
71 Posix Access Control Lists (ACLs) support permissions for users and
72 groups beyond the owner/group/world scheme.
73
74 To learn more about Access Control Lists, visit the Posix ACLs for
75 Linux website <http://acl.bestbits.at/>.
76
77 If you don't know what Access Control Lists are, say N
78
79config EXT3_FS_SECURITY
80 bool "Ext3 Security Labels"
81 depends on EXT3_FS_XATTR
82 help
83 Security labels support alternative access control models
84 implemented by security modules like SELinux. This option
85 enables an extended attribute handler for file security
86 labels in the ext3 filesystem.
87
88 If you are not using a security module that requires using
89 extended attributes for file security labels, say N.
diff --git a/fs/ext3/Makefile b/fs/ext3/Makefile
deleted file mode 100644
index e77766a8b3f0..000000000000
--- a/fs/ext3/Makefile
+++ /dev/null
@@ -1,12 +0,0 @@
1#
2# Makefile for the linux ext3-filesystem routines.
3#
4
5obj-$(CONFIG_EXT3_FS) += ext3.o
6
7ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
8 ioctl.o namei.o super.o symlink.o hash.o resize.o ext3_jbd.o
9
10ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
11ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
12ext3-$(CONFIG_EXT3_FS_SECURITY) += xattr_security.o
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
deleted file mode 100644
index 8bbaf5bcf982..000000000000
--- a/fs/ext3/acl.c
+++ /dev/null
@@ -1,281 +0,0 @@
1/*
2 * linux/fs/ext3/acl.c
3 *
4 * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
5 */
6
7#include "ext3.h"
8#include "xattr.h"
9#include "acl.h"
10
11/*
12 * Convert from filesystem to in-memory representation.
13 */
14static struct posix_acl *
15ext3_acl_from_disk(const void *value, size_t size)
16{
17 const char *end = (char *)value + size;
18 int n, count;
19 struct posix_acl *acl;
20
21 if (!value)
22 return NULL;
23 if (size < sizeof(ext3_acl_header))
24 return ERR_PTR(-EINVAL);
25 if (((ext3_acl_header *)value)->a_version !=
26 cpu_to_le32(EXT3_ACL_VERSION))
27 return ERR_PTR(-EINVAL);
28 value = (char *)value + sizeof(ext3_acl_header);
29 count = ext3_acl_count(size);
30 if (count < 0)
31 return ERR_PTR(-EINVAL);
32 if (count == 0)
33 return NULL;
34 acl = posix_acl_alloc(count, GFP_NOFS);
35 if (!acl)
36 return ERR_PTR(-ENOMEM);
37 for (n=0; n < count; n++) {
38 ext3_acl_entry *entry =
39 (ext3_acl_entry *)value;
40 if ((char *)value + sizeof(ext3_acl_entry_short) > end)
41 goto fail;
42 acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag);
43 acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
44 switch(acl->a_entries[n].e_tag) {
45 case ACL_USER_OBJ:
46 case ACL_GROUP_OBJ:
47 case ACL_MASK:
48 case ACL_OTHER:
49 value = (char *)value +
50 sizeof(ext3_acl_entry_short);
51 break;
52
53 case ACL_USER:
54 value = (char *)value + sizeof(ext3_acl_entry);
55 if ((char *)value > end)
56 goto fail;
57 acl->a_entries[n].e_uid =
58 make_kuid(&init_user_ns,
59 le32_to_cpu(entry->e_id));
60 break;
61 case ACL_GROUP:
62 value = (char *)value + sizeof(ext3_acl_entry);
63 if ((char *)value > end)
64 goto fail;
65 acl->a_entries[n].e_gid =
66 make_kgid(&init_user_ns,
67 le32_to_cpu(entry->e_id));
68 break;
69
70 default:
71 goto fail;
72 }
73 }
74 if (value != end)
75 goto fail;
76 return acl;
77
78fail:
79 posix_acl_release(acl);
80 return ERR_PTR(-EINVAL);
81}
82
83/*
84 * Convert from in-memory to filesystem representation.
85 */
86static void *
87ext3_acl_to_disk(const struct posix_acl *acl, size_t *size)
88{
89 ext3_acl_header *ext_acl;
90 char *e;
91 size_t n;
92
93 *size = ext3_acl_size(acl->a_count);
94 ext_acl = kmalloc(sizeof(ext3_acl_header) + acl->a_count *
95 sizeof(ext3_acl_entry), GFP_NOFS);
96 if (!ext_acl)
97 return ERR_PTR(-ENOMEM);
98 ext_acl->a_version = cpu_to_le32(EXT3_ACL_VERSION);
99 e = (char *)ext_acl + sizeof(ext3_acl_header);
100 for (n=0; n < acl->a_count; n++) {
101 const struct posix_acl_entry *acl_e = &acl->a_entries[n];
102 ext3_acl_entry *entry = (ext3_acl_entry *)e;
103 entry->e_tag = cpu_to_le16(acl_e->e_tag);
104 entry->e_perm = cpu_to_le16(acl_e->e_perm);
105 switch(acl_e->e_tag) {
106 case ACL_USER:
107 entry->e_id = cpu_to_le32(
108 from_kuid(&init_user_ns, acl_e->e_uid));
109 e += sizeof(ext3_acl_entry);
110 break;
111 case ACL_GROUP:
112 entry->e_id = cpu_to_le32(
113 from_kgid(&init_user_ns, acl_e->e_gid));
114 e += sizeof(ext3_acl_entry);
115 break;
116
117 case ACL_USER_OBJ:
118 case ACL_GROUP_OBJ:
119 case ACL_MASK:
120 case ACL_OTHER:
121 e += sizeof(ext3_acl_entry_short);
122 break;
123
124 default:
125 goto fail;
126 }
127 }
128 return (char *)ext_acl;
129
130fail:
131 kfree(ext_acl);
132 return ERR_PTR(-EINVAL);
133}
134
135/*
136 * Inode operation get_posix_acl().
137 *
138 * inode->i_mutex: don't care
139 */
140struct posix_acl *
141ext3_get_acl(struct inode *inode, int type)
142{
143 int name_index;
144 char *value = NULL;
145 struct posix_acl *acl;
146 int retval;
147
148 switch (type) {
149 case ACL_TYPE_ACCESS:
150 name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS;
151 break;
152 case ACL_TYPE_DEFAULT:
153 name_index = EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT;
154 break;
155 default:
156 BUG();
157 }
158
159 retval = ext3_xattr_get(inode, name_index, "", NULL, 0);
160 if (retval > 0) {
161 value = kmalloc(retval, GFP_NOFS);
162 if (!value)
163 return ERR_PTR(-ENOMEM);
164 retval = ext3_xattr_get(inode, name_index, "", value, retval);
165 }
166 if (retval > 0)
167 acl = ext3_acl_from_disk(value, retval);
168 else if (retval == -ENODATA || retval == -ENOSYS)
169 acl = NULL;
170 else
171 acl = ERR_PTR(retval);
172 kfree(value);
173
174 if (!IS_ERR(acl))
175 set_cached_acl(inode, type, acl);
176
177 return acl;
178}
179
180/*
181 * Set the access or default ACL of an inode.
182 *
183 * inode->i_mutex: down unless called from ext3_new_inode
184 */
185static int
186__ext3_set_acl(handle_t *handle, struct inode *inode, int type,
187 struct posix_acl *acl)
188{
189 int name_index;
190 void *value = NULL;
191 size_t size = 0;
192 int error;
193
194 switch(type) {
195 case ACL_TYPE_ACCESS:
196 name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS;
197 if (acl) {
198 error = posix_acl_equiv_mode(acl, &inode->i_mode);
199 if (error < 0)
200 return error;
201 else {
202 inode->i_ctime = CURRENT_TIME_SEC;
203 ext3_mark_inode_dirty(handle, inode);
204 if (error == 0)
205 acl = NULL;
206 }
207 }
208 break;
209
210 case ACL_TYPE_DEFAULT:
211 name_index = EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT;
212 if (!S_ISDIR(inode->i_mode))
213 return acl ? -EACCES : 0;
214 break;
215
216 default:
217 return -EINVAL;
218 }
219 if (acl) {
220 value = ext3_acl_to_disk(acl, &size);
221 if (IS_ERR(value))
222 return (int)PTR_ERR(value);
223 }
224
225 error = ext3_xattr_set_handle(handle, inode, name_index, "",
226 value, size, 0);
227
228 kfree(value);
229
230 if (!error)
231 set_cached_acl(inode, type, acl);
232
233 return error;
234}
235
236int
237ext3_set_acl(struct inode *inode, struct posix_acl *acl, int type)
238{
239 handle_t *handle;
240 int error, retries = 0;
241
242retry:
243 handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS(inode->i_sb));
244 if (IS_ERR(handle))
245 return PTR_ERR(handle);
246 error = __ext3_set_acl(handle, inode, type, acl);
247 ext3_journal_stop(handle);
248 if (error == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
249 goto retry;
250 return error;
251}
252
253/*
254 * Initialize the ACLs of a new inode. Called from ext3_new_inode.
255 *
256 * dir->i_mutex: down
257 * inode->i_mutex: up (access to inode is still exclusive)
258 */
259int
260ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
261{
262 struct posix_acl *default_acl, *acl;
263 int error;
264
265 error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
266 if (error)
267 return error;
268
269 if (default_acl) {
270 error = __ext3_set_acl(handle, inode, ACL_TYPE_DEFAULT,
271 default_acl);
272 posix_acl_release(default_acl);
273 }
274 if (acl) {
275 if (!error)
276 error = __ext3_set_acl(handle, inode, ACL_TYPE_ACCESS,
277 acl);
278 posix_acl_release(acl);
279 }
280 return error;
281}
diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h
deleted file mode 100644
index ea1c69edab9e..000000000000
--- a/fs/ext3/acl.h
+++ /dev/null
@@ -1,72 +0,0 @@
1/*
2 File: fs/ext3/acl.h
3
4 (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
5*/
6
7#include <linux/posix_acl_xattr.h>
8
9#define EXT3_ACL_VERSION 0x0001
10
11typedef struct {
12 __le16 e_tag;
13 __le16 e_perm;
14 __le32 e_id;
15} ext3_acl_entry;
16
17typedef struct {
18 __le16 e_tag;
19 __le16 e_perm;
20} ext3_acl_entry_short;
21
22typedef struct {
23 __le32 a_version;
24} ext3_acl_header;
25
26static inline size_t ext3_acl_size(int count)
27{
28 if (count <= 4) {
29 return sizeof(ext3_acl_header) +
30 count * sizeof(ext3_acl_entry_short);
31 } else {
32 return sizeof(ext3_acl_header) +
33 4 * sizeof(ext3_acl_entry_short) +
34 (count - 4) * sizeof(ext3_acl_entry);
35 }
36}
37
38static inline int ext3_acl_count(size_t size)
39{
40 ssize_t s;
41 size -= sizeof(ext3_acl_header);
42 s = size - 4 * sizeof(ext3_acl_entry_short);
43 if (s < 0) {
44 if (size % sizeof(ext3_acl_entry_short))
45 return -1;
46 return size / sizeof(ext3_acl_entry_short);
47 } else {
48 if (s % sizeof(ext3_acl_entry))
49 return -1;
50 return s / sizeof(ext3_acl_entry) + 4;
51 }
52}
53
54#ifdef CONFIG_EXT3_FS_POSIX_ACL
55
56/* acl.c */
57extern struct posix_acl *ext3_get_acl(struct inode *inode, int type);
58extern int ext3_set_acl(struct inode *inode, struct posix_acl *acl, int type);
59extern int ext3_init_acl (handle_t *, struct inode *, struct inode *);
60
61#else /* CONFIG_EXT3_FS_POSIX_ACL */
62#include <linux/sched.h>
63#define ext3_get_acl NULL
64#define ext3_set_acl NULL
65
66static inline int
67ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
68{
69 return 0;
70}
71#endif /* CONFIG_EXT3_FS_POSIX_ACL */
72
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
deleted file mode 100644
index 158b5d4ce067..000000000000
--- a/fs/ext3/balloc.c
+++ /dev/null
@@ -1,2158 +0,0 @@
1/*
2 * linux/fs/ext3/balloc.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993
10 * Big-endian to little-endian byte-swapping/bitmaps by
11 * David S. Miller (davem@caip.rutgers.edu), 1995
12 */
13
14#include <linux/quotaops.h>
15#include <linux/blkdev.h>
16#include "ext3.h"
17
18/*
19 * balloc.c contains the blocks allocation and deallocation routines
20 */
21
22/*
23 * The free blocks are managed by bitmaps. A file system contains several
24 * block groups. Each group contains 1 bitmap block for blocks, 1 bitmap
25 * block for inodes, N blocks for the inode table and data blocks.
26 *
27 * The file system contains group descriptors which are located after the
28 * super block. Each descriptor contains the number of the bitmap block and
29 * the free blocks count in the group. The descriptors are loaded in memory
30 * when a file system is mounted (see ext3_fill_super).
31 */
32
33
34#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
35
36/*
37 * Calculate the block group number and offset, given a block number
38 */
39static void ext3_get_group_no_and_offset(struct super_block *sb,
40 ext3_fsblk_t blocknr, unsigned long *blockgrpp, ext3_grpblk_t *offsetp)
41{
42 struct ext3_super_block *es = EXT3_SB(sb)->s_es;
43
44 blocknr = blocknr - le32_to_cpu(es->s_first_data_block);
45 if (offsetp)
46 *offsetp = blocknr % EXT3_BLOCKS_PER_GROUP(sb);
47 if (blockgrpp)
48 *blockgrpp = blocknr / EXT3_BLOCKS_PER_GROUP(sb);
49}
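For example (the geometry numbers here are assumed for illustration, not taken from this file): on a filesystem where s_first_data_block == 1 and EXT3_BLOCKS_PER_GROUP(sb) == 8192, block 20000 lands in group 2 at offset 3615:

	unsigned long group;
	ext3_grpblk_t offset;

	ext3_get_group_no_and_offset(sb, 20000, &group, &offset);
	/* (20000 - 1) / 8192 == 2 and (20000 - 1) % 8192 == 3615, */
	/* so group == 2 and offset == 3615 */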
50
51/**
52 * ext3_get_group_desc() -- load group descriptor from disk
53 * @sb: super block
54 * @block_group: given block group
55 * @bh: pointer to the buffer head to store the block
56 * group descriptor
57 */
58struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
59 unsigned int block_group,
60 struct buffer_head ** bh)
61{
62 unsigned long group_desc;
63 unsigned long offset;
64 struct ext3_group_desc * desc;
65 struct ext3_sb_info *sbi = EXT3_SB(sb);
66
67 if (block_group >= sbi->s_groups_count) {
68 ext3_error (sb, "ext3_get_group_desc",
69 "block_group >= groups_count - "
70 "block_group = %d, groups_count = %lu",
71 block_group, sbi->s_groups_count);
72
73 return NULL;
74 }
75 smp_rmb();
76
77 group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(sb);
78 offset = block_group & (EXT3_DESC_PER_BLOCK(sb) - 1);
79 if (!sbi->s_group_desc[group_desc]) {
80 ext3_error (sb, "ext3_get_group_desc",
81 "Group descriptor not loaded - "
82 "block_group = %d, group_desc = %lu, desc = %lu",
83 block_group, group_desc, offset);
84 return NULL;
85 }
86
87 desc = (struct ext3_group_desc *) sbi->s_group_desc[group_desc]->b_data;
88 if (bh)
89 *bh = sbi->s_group_desc[group_desc];
90 return desc + offset;
91}
92
93static int ext3_valid_block_bitmap(struct super_block *sb,
94 struct ext3_group_desc *desc,
95 unsigned int block_group,
96 struct buffer_head *bh)
97{
98 ext3_grpblk_t offset;
99 ext3_grpblk_t next_zero_bit;
100 ext3_fsblk_t bitmap_blk;
101 ext3_fsblk_t group_first_block;
102
103 group_first_block = ext3_group_first_block_no(sb, block_group);
104
105 /* check whether block bitmap block number is set */
106 bitmap_blk = le32_to_cpu(desc->bg_block_bitmap);
107 offset = bitmap_blk - group_first_block;
108 if (!ext3_test_bit(offset, bh->b_data))
109 /* bad block bitmap */
110 goto err_out;
111
112 /* check whether the inode bitmap block number is set */
113 bitmap_blk = le32_to_cpu(desc->bg_inode_bitmap);
114 offset = bitmap_blk - group_first_block;
115 if (!ext3_test_bit(offset, bh->b_data))
116 /* bad block bitmap */
117 goto err_out;
118
119 /* check whether the inode table block number is set */
120 bitmap_blk = le32_to_cpu(desc->bg_inode_table);
121 offset = bitmap_blk - group_first_block;
122 next_zero_bit = ext3_find_next_zero_bit(bh->b_data,
123 offset + EXT3_SB(sb)->s_itb_per_group,
124 offset);
125 if (next_zero_bit >= offset + EXT3_SB(sb)->s_itb_per_group)
126 /* good bitmap for inode tables */
127 return 1;
128
129err_out:
130 ext3_error(sb, __func__,
131 "Invalid block bitmap - "
132 "block_group = %d, block = %lu",
133 block_group, bitmap_blk);
134 return 0;
135}
136
137/**
138 * read_block_bitmap()
139 * @sb: super block
140 * @block_group: given block group
141 *
142 * Read the bitmap for a given block_group, and validate that the
143 * bits for the block/inode bitmaps and inode table are set in the bitmap
144 *
145 * Return buffer_head on success or NULL in case of failure.
146 */
147static struct buffer_head *
148read_block_bitmap(struct super_block *sb, unsigned int block_group)
149{
150 struct ext3_group_desc * desc;
151 struct buffer_head * bh = NULL;
152 ext3_fsblk_t bitmap_blk;
153
154 desc = ext3_get_group_desc(sb, block_group, NULL);
155 if (!desc)
156 return NULL;
157 trace_ext3_read_block_bitmap(sb, block_group);
158 bitmap_blk = le32_to_cpu(desc->bg_block_bitmap);
159 bh = sb_getblk(sb, bitmap_blk);
160 if (unlikely(!bh)) {
161 ext3_error(sb, __func__,
162 "Cannot read block bitmap - "
163 "block_group = %d, block_bitmap = %u",
164 block_group, le32_to_cpu(desc->bg_block_bitmap));
165 return NULL;
166 }
167 if (likely(bh_uptodate_or_lock(bh)))
168 return bh;
169
170 if (bh_submit_read(bh) < 0) {
171 brelse(bh);
172 ext3_error(sb, __func__,
173 "Cannot read block bitmap - "
174 "block_group = %d, block_bitmap = %u",
175 block_group, le32_to_cpu(desc->bg_block_bitmap));
176 return NULL;
177 }
178 ext3_valid_block_bitmap(sb, desc, block_group, bh);
179 /*
180 * The file system was mounted not to panic on errors; continue with
181 * the corrupt bitmap
182 */
183 return bh;
184}
185/*
186 * The reservation window structure operations
187 * --------------------------------------------
188 * Operations include:
189 * dump, find, add, remove, is_empty, find_next_reservable_window, etc.
190 *
191 * We use a red-black tree to represent per-filesystem reservation
192 * windows.
193 *
194 */
195
196/**
197 * __rsv_window_dump() -- Dump the filesystem block allocation reservation map
198 * @rb_root: root of per-filesystem reservation rb tree
199 * @verbose: verbose mode
200 * @fn: function which wishes to dump the reservation map
201 *
202 * If verbose is turned on, it will print the whole block reservation
203 * windows(start, end). Otherwise, it will only print out the "bad" windows,
204 * those windows that overlap with their immediate neighbors.
205 */
206#if 1
207static void __rsv_window_dump(struct rb_root *root, int verbose,
208 const char *fn)
209{
210 struct rb_node *n;
211 struct ext3_reserve_window_node *rsv, *prev;
212 int bad;
213
214restart:
215 n = rb_first(root);
216 bad = 0;
217 prev = NULL;
218
219 printk("Block Allocation Reservation Windows Map (%s):\n", fn);
220 while (n) {
221 rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node);
222 if (verbose)
223 printk("reservation window 0x%p "
224 "start: %lu, end: %lu\n",
225 rsv, rsv->rsv_start, rsv->rsv_end);
226 if (rsv->rsv_start && rsv->rsv_start >= rsv->rsv_end) {
227 printk("Bad reservation %p (start >= end)\n",
228 rsv);
229 bad = 1;
230 }
231 if (prev && prev->rsv_end >= rsv->rsv_start) {
232 printk("Bad reservation %p (prev->end >= start)\n",
233 rsv);
234 bad = 1;
235 }
236 if (bad) {
237 if (!verbose) {
238 printk("Restarting reservation walk in verbose mode\n");
239 verbose = 1;
240 goto restart;
241 }
242 }
243 n = rb_next(n);
244 prev = rsv;
245 }
246 printk("Window map complete.\n");
247 BUG_ON(bad);
248}
249#define rsv_window_dump(root, verbose) \
250 __rsv_window_dump((root), (verbose), __func__)
251#else
252#define rsv_window_dump(root, verbose) do {} while (0)
253#endif
254
255/**
256 * goal_in_my_reservation()
257 * @rsv: inode's reservation window
258 * @grp_goal: given goal block relative to the allocation block group
259 * @group: the current allocation block group
260 * @sb: filesystem super block
261 *
262 * Test if the given goal block (group relative) is within the file's
263 * own block reservation window range.
264 *
265 * If the reservation window is outside the goal allocation group, return 0;
266 * grp_goal (given goal block) could be -1, which means no specific
267 * goal block. In this case, always return 1.
268 * If the goal block is within the reservation window, return 1;
269 * otherwise, return 0;
270 */
271static int
272goal_in_my_reservation(struct ext3_reserve_window *rsv, ext3_grpblk_t grp_goal,
273 unsigned int group, struct super_block * sb)
274{
275 ext3_fsblk_t group_first_block, group_last_block;
276
277 group_first_block = ext3_group_first_block_no(sb, group);
278 group_last_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1);
279
280 if ((rsv->_rsv_start > group_last_block) ||
281 (rsv->_rsv_end < group_first_block))
282 return 0;
283 if ((grp_goal >= 0) && ((grp_goal + group_first_block < rsv->_rsv_start)
284 || (grp_goal + group_first_block > rsv->_rsv_end)))
285 return 0;
286 return 1;
287}
288
289/**
290 * search_reserve_window()
291 * @rb_root: root of reservation tree
292 * @goal: target allocation block
293 *
294 * Find the reserved window which includes the goal, or the previous one
295 * if the goal is not in any window.
296 * Returns NULL if there are no windows or if all windows start after the goal.
297 */
298static struct ext3_reserve_window_node *
299search_reserve_window(struct rb_root *root, ext3_fsblk_t goal)
300{
301 struct rb_node *n = root->rb_node;
302 struct ext3_reserve_window_node *rsv;
303
304 if (!n)
305 return NULL;
306
307 do {
308 rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node);
309
310 if (goal < rsv->rsv_start)
311 n = n->rb_left;
312 else if (goal > rsv->rsv_end)
313 n = n->rb_right;
314 else
315 return rsv;
316 } while (n);
317 /*
318 * We've fallen off the end of the tree: the goal wasn't inside
319 * any particular node. OK, the previous node must be to one
320 * side of the interval containing the goal. If it's the RHS,
321 * we need to back up one.
322 */
323 if (rsv->rsv_start > goal) {
324 n = rb_prev(&rsv->rsv_node);
325 rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node);
326 }
327 return rsv;
328}
329
330/**
331 * ext3_rsv_window_add() -- Insert a window to the block reservation rb tree.
332 * @sb: super block
333 * @rsv: reservation window to add
334 *
335 * Must be called with rsv_lock held.
336 */
337void ext3_rsv_window_add(struct super_block *sb,
338 struct ext3_reserve_window_node *rsv)
339{
340 struct rb_root *root = &EXT3_SB(sb)->s_rsv_window_root;
341 struct rb_node *node = &rsv->rsv_node;
342 ext3_fsblk_t start = rsv->rsv_start;
343
344 struct rb_node ** p = &root->rb_node;
345 struct rb_node * parent = NULL;
346 struct ext3_reserve_window_node *this;
347
348 trace_ext3_rsv_window_add(sb, rsv);
349 while (*p)
350 {
351 parent = *p;
352 this = rb_entry(parent, struct ext3_reserve_window_node, rsv_node);
353
354 if (start < this->rsv_start)
355 p = &(*p)->rb_left;
356 else if (start > this->rsv_end)
357 p = &(*p)->rb_right;
358 else {
359 rsv_window_dump(root, 1);
360 BUG();
361 }
362 }
363
364 rb_link_node(node, parent, p);
365 rb_insert_color(node, root);
366}
367
368/**
369 * ext3_rsv_window_remove() -- unlink a window from the reservation rb tree
370 * @sb: super block
371 * @rsv: reservation window to remove
372 *
373 * Mark the block reservation window as not allocated, and unlink it
374 * from the filesystem reservation window rb tree. Must be called with
375 * rsv_lock held.
376 */
377static void rsv_window_remove(struct super_block *sb,
378 struct ext3_reserve_window_node *rsv)
379{
380 rsv->rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
381 rsv->rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
382 rsv->rsv_alloc_hit = 0;
383 rb_erase(&rsv->rsv_node, &EXT3_SB(sb)->s_rsv_window_root);
384}
385
386/*
387 * rsv_is_empty() -- Check whether the reservation window is unallocated.
388 * @rsv: given reservation window to check
389 *
390 * returns 1 if the end block is EXT3_RESERVE_WINDOW_NOT_ALLOCATED.
391 */
392static inline int rsv_is_empty(struct ext3_reserve_window *rsv)
393{
394 /* a valid reservation end block could not be 0 */
395 return rsv->_rsv_end == EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
396}
397
398/**
399 * ext3_init_block_alloc_info()
400 * @inode: file inode structure
401 *
402 * Allocate and initialize the reservation window structure, and
403 * link the window to the ext3 inode structure.
404 *
405 * The reservation window structure is only dynamically allocated
406 * and linked to the ext3 inode the first time the open file
407 * needs a new block. So, before every ext3_new_block(s) call, for
408 * regular files, we should check whether the reservation window
409 * structure exists; if it does not, this function is called.
410 * Failing to do so will result in block reservation being turned off
411 * for that open file.
412 *
413 * This function is called from ext3_get_blocks_handle(), also called
414 * when setting the reservation window size through ioctl before the file
415 * is opened for write (needs block allocation).
416 *
417 * Needs truncate_mutex protection prior to calling this function.
418 */
419void ext3_init_block_alloc_info(struct inode *inode)
420{
421 struct ext3_inode_info *ei = EXT3_I(inode);
422 struct ext3_block_alloc_info *block_i;
423 struct super_block *sb = inode->i_sb;
424
425 block_i = kmalloc(sizeof(*block_i), GFP_NOFS);
426 if (block_i) {
427 struct ext3_reserve_window_node *rsv = &block_i->rsv_window_node;
428
429 rsv->rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
430 rsv->rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
431
432 /*
433 * if filesystem is mounted with NORESERVATION, the goal
434 * reservation window size is set to zero to indicate
435 * block reservation is off
436 */
437 if (!test_opt(sb, RESERVATION))
438 rsv->rsv_goal_size = 0;
439 else
440 rsv->rsv_goal_size = EXT3_DEFAULT_RESERVE_BLOCKS;
441 rsv->rsv_alloc_hit = 0;
442 block_i->last_alloc_logical_block = 0;
443 block_i->last_alloc_physical_block = 0;
444 }
445 ei->i_block_alloc_info = block_i;
446}
447
448/**
449 * ext3_discard_reservation()
450 * @inode: inode
451 *
452 * Discard (free) the block reservation window on last file close,
453 * on truncate, or at the last iput().
454 *
455 * It is called in three cases:
456 * ext3_release_file(): the last writer closes the file
457 * ext3_clear_inode(): the last iput(), when nothing links to this file.
458 * ext3_truncate(): when the block indirect map is about to change.
459 *
460 */
461void ext3_discard_reservation(struct inode *inode)
462{
463 struct ext3_inode_info *ei = EXT3_I(inode);
464 struct ext3_block_alloc_info *block_i = ei->i_block_alloc_info;
465 struct ext3_reserve_window_node *rsv;
466 spinlock_t *rsv_lock = &EXT3_SB(inode->i_sb)->s_rsv_window_lock;
467
468 if (!block_i)
469 return;
470
471 rsv = &block_i->rsv_window_node;
472 if (!rsv_is_empty(&rsv->rsv_window)) {
473 spin_lock(rsv_lock);
474 if (!rsv_is_empty(&rsv->rsv_window)) {
475 trace_ext3_discard_reservation(inode, rsv);
476 rsv_window_remove(inode->i_sb, rsv);
477 }
478 spin_unlock(rsv_lock);
479 }
480}
481
482/**
483 * ext3_free_blocks_sb() -- Free given blocks and update quota
484 * @handle: handle to this transaction
485 * @sb: super block
486 * @block: start physical block to free
487 * @count: number of blocks to free
488 * @pdquot_freed_blocks: pointer to quota
489 */
490void ext3_free_blocks_sb(handle_t *handle, struct super_block *sb,
491 ext3_fsblk_t block, unsigned long count,
492 unsigned long *pdquot_freed_blocks)
493{
494 struct buffer_head *bitmap_bh = NULL;
495 struct buffer_head *gd_bh;
496 unsigned long block_group;
497 ext3_grpblk_t bit;
498 unsigned long i;
499 unsigned long overflow;
500 struct ext3_group_desc * desc;
501 struct ext3_super_block * es;
502 struct ext3_sb_info *sbi;
503 int err = 0, ret;
504 ext3_grpblk_t group_freed;
505
506 *pdquot_freed_blocks = 0;
507 sbi = EXT3_SB(sb);
508 es = sbi->s_es;
509 if (block < le32_to_cpu(es->s_first_data_block) ||
510 block + count < block ||
511 block + count > le32_to_cpu(es->s_blocks_count)) {
512 ext3_error (sb, "ext3_free_blocks",
513 "Freeing blocks not in datazone - "
514 "block = "E3FSBLK", count = %lu", block, count);
515 goto error_return;
516 }
517
518 ext3_debug ("freeing block(s) %lu-%lu\n", block, block + count - 1);
519
520do_more:
521 overflow = 0;
522 block_group = (block - le32_to_cpu(es->s_first_data_block)) /
523 EXT3_BLOCKS_PER_GROUP(sb);
524 bit = (block - le32_to_cpu(es->s_first_data_block)) %
525 EXT3_BLOCKS_PER_GROUP(sb);
526 /*
527 * Check to see if we are freeing blocks across a group
528 * boundary.
529 */
530 if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) {
531 overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb);
532 count -= overflow;
533 }
534 brelse(bitmap_bh);
535 bitmap_bh = read_block_bitmap(sb, block_group);
536 if (!bitmap_bh)
537 goto error_return;
538 desc = ext3_get_group_desc (sb, block_group, &gd_bh);
539 if (!desc)
540 goto error_return;
541
542 if (in_range (le32_to_cpu(desc->bg_block_bitmap), block, count) ||
543 in_range (le32_to_cpu(desc->bg_inode_bitmap), block, count) ||
544 in_range (block, le32_to_cpu(desc->bg_inode_table),
545 sbi->s_itb_per_group) ||
546 in_range (block + count - 1, le32_to_cpu(desc->bg_inode_table),
547 sbi->s_itb_per_group)) {
548 ext3_error (sb, "ext3_free_blocks",
549 "Freeing blocks in system zones - "
550 "Block = "E3FSBLK", count = %lu",
551 block, count);
552 goto error_return;
553 }
554
555 /*
556 * We are about to start releasing blocks in the bitmap,
557 * so we need undo access.
558 */
559 /* @@@ check errors */
560 BUFFER_TRACE(bitmap_bh, "getting undo access");
561 err = ext3_journal_get_undo_access(handle, bitmap_bh);
562 if (err)
563 goto error_return;
564
565 /*
566 * We are about to modify some metadata. Call the journal APIs
567 * to unshare ->b_data if a currently-committing transaction is
568 * using it
569 */
570 BUFFER_TRACE(gd_bh, "get_write_access");
571 err = ext3_journal_get_write_access(handle, gd_bh);
572 if (err)
573 goto error_return;
574
575 jbd_lock_bh_state(bitmap_bh);
576
577 for (i = 0, group_freed = 0; i < count; i++) {
578 /*
579 * An HJ special. This is expensive...
580 */
581#ifdef CONFIG_JBD_DEBUG
582 jbd_unlock_bh_state(bitmap_bh);
583 {
584 struct buffer_head *debug_bh;
585 debug_bh = sb_find_get_block(sb, block + i);
586 if (debug_bh) {
587 BUFFER_TRACE(debug_bh, "Deleted!");
588 if (!bh2jh(bitmap_bh)->b_committed_data)
589 BUFFER_TRACE(debug_bh,
590 "No committed data in bitmap");
591 BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap");
592 __brelse(debug_bh);
593 }
594 }
595 jbd_lock_bh_state(bitmap_bh);
596#endif
597 if (need_resched()) {
598 jbd_unlock_bh_state(bitmap_bh);
599 cond_resched();
600 jbd_lock_bh_state(bitmap_bh);
601 }
602 /* @@@ This prevents newly-allocated data from being
603 * freed and then reallocated within the same
604 * transaction.
605 *
606 * Ideally we would want to allow that to happen, but to
607 * do so requires making journal_forget() capable of
608 * revoking the queued write of a data block, which
609 * implies blocking on the journal lock. *forget()
610 * cannot block due to truncate races.
611 *
612 * Eventually we can fix this by making journal_forget()
613 * return a status indicating whether or not it was able
614 * to revoke the buffer. On successful revoke, it is
615 * safe not to set the allocation bit in the committed
616 * bitmap, because we know that there is no outstanding
617 * activity on the buffer any more and so it is safe to
618 * reallocate it.
619 */
620 BUFFER_TRACE(bitmap_bh, "set in b_committed_data");
621 J_ASSERT_BH(bitmap_bh,
622 bh2jh(bitmap_bh)->b_committed_data != NULL);
623 ext3_set_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i,
624 bh2jh(bitmap_bh)->b_committed_data);
625
626 /*
627 * We clear the bit in the bitmap after setting the committed
628 * data bit, because this is the reverse order to that which
629 * the allocator uses.
630 */
631 BUFFER_TRACE(bitmap_bh, "clear bit");
632 if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
633 bit + i, bitmap_bh->b_data)) {
634 jbd_unlock_bh_state(bitmap_bh);
635 ext3_error(sb, __func__,
636 "bit already cleared for block "E3FSBLK,
637 block + i);
638 jbd_lock_bh_state(bitmap_bh);
639 BUFFER_TRACE(bitmap_bh, "bit already cleared");
640 } else {
641 group_freed++;
642 }
643 }
644 jbd_unlock_bh_state(bitmap_bh);
645
646 spin_lock(sb_bgl_lock(sbi, block_group));
647 le16_add_cpu(&desc->bg_free_blocks_count, group_freed);
648 spin_unlock(sb_bgl_lock(sbi, block_group));
649 percpu_counter_add(&sbi->s_freeblocks_counter, count);
650
651 /* We dirtied the bitmap block */
652 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
653 err = ext3_journal_dirty_metadata(handle, bitmap_bh);
654
655 /* And the group descriptor block */
656 BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
657 ret = ext3_journal_dirty_metadata(handle, gd_bh);
658 if (!err) err = ret;
659 *pdquot_freed_blocks += group_freed;
660
661 if (overflow && !err) {
662 block += count;
663 count = overflow;
664 goto do_more;
665 }
666
667error_return:
668 brelse(bitmap_bh);
669 ext3_std_error(sb, err);
670 return;
671}
672
673/**
674 * ext3_free_blocks() -- Free given blocks and update quota
675 * @handle: handle for this transaction
676 * @inode: inode
677 * @block: start physical block to free
678 * @count: number of blocks to count
679 */
680void ext3_free_blocks(handle_t *handle, struct inode *inode,
681 ext3_fsblk_t block, unsigned long count)
682{
683 struct super_block *sb = inode->i_sb;
684 unsigned long dquot_freed_blocks;
685
686 trace_ext3_free_blocks(inode, block, count);
687 ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
688 if (dquot_freed_blocks)
689 dquot_free_block(inode, dquot_freed_blocks);
690 return;
691}
692
693/**
694 * ext3_test_allocatable()
695 * @nr: given block (group relative) to test
696 * @bh: bufferhead containing the bitmap of the given block group
697 *
698 * For ext3 allocations, we must not reuse any blocks which are
699 * allocated in the bitmap buffer's "last committed data" copy. This
700 * prevents deletes from freeing up the page for reuse until we have
701 * committed the delete transaction.
702 *
703 * If we didn't do this, then deleting something and reallocating it as
704 * data would allow the old block to be overwritten before the
705 * transaction committed (because we force data to disk before commit).
706 * This would lead to corruption if we crashed between overwriting the
707 * data and committing the delete.
708 *
709 * @@@ We may want to make this allocation behaviour conditional on
710 * data-writes at some point, and disable it for metadata allocations or
711 * sync-data inodes.
712 */
713static int ext3_test_allocatable(ext3_grpblk_t nr, struct buffer_head *bh)
714{
715 int ret;
716 struct journal_head *jh = bh2jh(bh);
717
718 if (ext3_test_bit(nr, bh->b_data))
719 return 0;
720
721 jbd_lock_bh_state(bh);
722 if (!jh->b_committed_data)
723 ret = 1;
724 else
725 ret = !ext3_test_bit(nr, jh->b_committed_data);
726 jbd_unlock_bh_state(bh);
727 return ret;
728}
729
730/**
731 * bitmap_search_next_usable_block()
732 * @start: the starting block (group relative) of the search
733 * @bh: bufferhead contains the block group bitmap
734 * @maxblocks: the ending block (group relative) of the reservation
735 *
736 * The bitmap search --- search forward alternately through the actual
737 * bitmap on disk and the last-committed copy in journal, until we find a
738 * bit free in both bitmaps.
739 */
740static ext3_grpblk_t
741bitmap_search_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
742 ext3_grpblk_t maxblocks)
743{
744 ext3_grpblk_t next;
745 struct journal_head *jh = bh2jh(bh);
746
747 while (start < maxblocks) {
748 next = ext3_find_next_zero_bit(bh->b_data, maxblocks, start);
749 if (next >= maxblocks)
750 return -1;
751 if (ext3_test_allocatable(next, bh))
752 return next;
753 jbd_lock_bh_state(bh);
754 if (jh->b_committed_data)
755 start = ext3_find_next_zero_bit(jh->b_committed_data,
756 maxblocks, next);
757 jbd_unlock_bh_state(bh);
758 }
759 return -1;
760}
761
762/**
763 * find_next_usable_block()
764 * @start: the starting block (group relative) to find next
765 * allocatable block in bitmap.
766 * @bh: bufferhead contains the block group bitmap
767 * @maxblocks: the ending block (group relative) for the search
768 *
769 * Find an allocatable block in a bitmap. We honor both the bitmap and
770 * its last-committed copy (if that exists), and perform the "most
771 * appropriate allocation" algorithm of looking for a free block near
772 * the initial goal; then for a free byte somewhere in the bitmap; then
773 * for any free bit in the bitmap.
774 */
775static ext3_grpblk_t
776find_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
777 ext3_grpblk_t maxblocks)
778{
779 ext3_grpblk_t here, next;
780 char *p, *r;
781
782 if (start > 0) {
783 /*
784 * The goal was occupied; search forward for a free
785 * block within the next XX blocks.
786 *
787 * end_goal is more or less random, but it has to be
788 * less than EXT3_BLOCKS_PER_GROUP. Aligning up to the
789 * next 64-bit boundary is simple..
790 */
791 ext3_grpblk_t end_goal = (start + 63) & ~63;
792 if (end_goal > maxblocks)
793 end_goal = maxblocks;
794 here = ext3_find_next_zero_bit(bh->b_data, end_goal, start);
795 if (here < end_goal && ext3_test_allocatable(here, bh))
796 return here;
797 ext3_debug("Bit not found near goal\n");
798 }
799
800 here = start;
801 if (here < 0)
802 here = 0;
803
804 p = bh->b_data + (here >> 3);
805 r = memscan(p, 0, ((maxblocks + 7) >> 3) - (here >> 3));
806 next = (r - bh->b_data) << 3;
807
808 if (next < maxblocks && next >= start && ext3_test_allocatable(next, bh))
809 return next;
810
811 /*
812 * The bitmap search --- search forward alternately through the actual
813 * bitmap and the last-committed copy until we find a bit free in
814 * both
815 */
816 here = bitmap_search_next_usable_block(here, bh, maxblocks);
817 return here;
818}
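The two heuristics above can be seen with concrete numbers (values are illustrative only):

	/* near-goal scan: with start == 100, the scan is capped at the next
	 * 64-bit boundary of the bitmap:
	 *	end_goal = (100 + 63) & ~63 == 128
	 *
	 * free-byte scan: if memscan() finds a zero byte at offset r from
	 * bh->b_data, the corresponding bit index is
	 *	next = r << 3
	 * i.e. eight consecutive free blocks starting at that bit.
	 */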
819
820/**
821 * claim_block()
822 * @lock: the spin lock for this block group
823 * @block: the free block (group relative) to allocate
824 * @bh: the buffer_head contains the block group bitmap
825 *
826 * We think we can allocate this block in this bitmap. Try to set the bit.
827 * If that succeeds then check that nobody has allocated and then freed the
828 * block since we saw that it was not marked in b_committed_data. If it _was_
829 * allocated and freed then clear the bit in the bitmap again and return
830 * zero (failure).
831 */
832static inline int
833claim_block(spinlock_t *lock, ext3_grpblk_t block, struct buffer_head *bh)
834{
835 struct journal_head *jh = bh2jh(bh);
836 int ret;
837
838 if (ext3_set_bit_atomic(lock, block, bh->b_data))
839 return 0;
840 jbd_lock_bh_state(bh);
841 if (jh->b_committed_data && ext3_test_bit(block,jh->b_committed_data)) {
842 ext3_clear_bit_atomic(lock, block, bh->b_data);
843 ret = 0;
844 } else {
845 ret = 1;
846 }
847 jbd_unlock_bh_state(bh);
848 return ret;
849}
850
851/**
852 * ext3_try_to_allocate()
853 * @sb: superblock
854 * @handle: handle to this transaction
855 * @group: given allocation block group
856 * @bitmap_bh: bufferhead holds the block bitmap
857 * @grp_goal: given target block within the group
858 * @count: target number of blocks to allocate
859 * @my_rsv: reservation window
860 *
861 * Attempt to allocate blocks within a given range. Set the range of the
862 * allocation first, then find the first free bit(s) in the bitmap (within
863 * the range), and finally allocate the blocks by claiming the free bits found.
864 *
865 * To set the range of this allocation:
866 * if there is a reservation window, only try to allocate block(s) from the
867 * file's own reservation window;
868 * Otherwise, the allocation range starts from the given goal block and ends at
869 * the block group's last block.
870 *
871 * If we failed to allocate the desired block then we may end up crossing to a
872 * new bitmap. In that case we must release write access to the old one via
873 * ext3_journal_release_buffer(), else we'll run out of credits.
874 */
875static ext3_grpblk_t
876ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
877 struct buffer_head *bitmap_bh, ext3_grpblk_t grp_goal,
878 unsigned long *count, struct ext3_reserve_window *my_rsv)
879{
880 ext3_fsblk_t group_first_block;
881 ext3_grpblk_t start, end;
882 unsigned long num = 0;
883
884 /* we do allocation within the reservation window if we have a window */
885 if (my_rsv) {
886 group_first_block = ext3_group_first_block_no(sb, group);
887 if (my_rsv->_rsv_start >= group_first_block)
888 start = my_rsv->_rsv_start - group_first_block;
889 else
890 /* reservation window cross group boundary */
891 start = 0;
892 end = my_rsv->_rsv_end - group_first_block + 1;
893 if (end > EXT3_BLOCKS_PER_GROUP(sb))
894 /* reservation window crosses group boundary */
895 end = EXT3_BLOCKS_PER_GROUP(sb);
896 if ((start <= grp_goal) && (grp_goal < end))
897 start = grp_goal;
898 else
899 grp_goal = -1;
900 } else {
901 if (grp_goal > 0)
902 start = grp_goal;
903 else
904 start = 0;
905 end = EXT3_BLOCKS_PER_GROUP(sb);
906 }
907
908 BUG_ON(start > EXT3_BLOCKS_PER_GROUP(sb));
909
910repeat:
911 if (grp_goal < 0 || !ext3_test_allocatable(grp_goal, bitmap_bh)) {
912 grp_goal = find_next_usable_block(start, bitmap_bh, end);
913 if (grp_goal < 0)
914 goto fail_access;
915 if (!my_rsv) {
916 int i;
917
918 for (i = 0; i < 7 && grp_goal > start &&
919 ext3_test_allocatable(grp_goal - 1,
920 bitmap_bh);
921 i++, grp_goal--)
922 ;
923 }
924 }
925 start = grp_goal;
926
927 if (!claim_block(sb_bgl_lock(EXT3_SB(sb), group),
928 grp_goal, bitmap_bh)) {
929 /*
930 * The block was allocated by another thread, or it was
931 * allocated and then freed by another thread
932 */
933 start++;
934 grp_goal++;
935 if (start >= end)
936 goto fail_access;
937 goto repeat;
938 }
939 num++;
940 grp_goal++;
941 while (num < *count && grp_goal < end
942 && ext3_test_allocatable(grp_goal, bitmap_bh)
943 && claim_block(sb_bgl_lock(EXT3_SB(sb), group),
944 grp_goal, bitmap_bh)) {
945 num++;
946 grp_goal++;
947 }
948 *count = num;
949 return grp_goal - num;
950fail_access:
951 *count = num;
952 return -1;
953}
954
955/**
956 * find_next_reservable_window():
957 * find a reservable space within the given range.
958 * It does not allocate the reservation window for now:
959 * alloc_new_reservation() will do the work later.
960 *
961 * @search_head: the head of the searching list;
962 * This is not necessarily the list head of the whole filesystem
963 *
964 * We have both head and start_block to assist the search
965 * for the reservable space. The list starts from head,
966 * but we will shift to the place where start_block is,
967 * then start from there, when looking for a reservable space.
968 *
969 * @my_rsv: the reservation window
970 *
971 * @sb: the super block
972 *
973 * @start_block: the first block we consider to start
974 * the real search from
975 *
976 * @last_block:
977 * the maximum block number that our goal reservable space
978 * could start from. This is normally the last block in this
979 * group. The search ends when the start of the next possible
980 * reservable space is found to be beyond this boundary.
981 * This handles the case of a reservation window request that
982 * crosses the group boundary.
983 *
984 * Basically, we search the given range (start_block, last_block),
985 * rather than the whole reservation tree, to find a free
986 * region that is of the requested size and has not
987 * been reserved.
988 *
989 */
990static int find_next_reservable_window(
991 struct ext3_reserve_window_node *search_head,
992 struct ext3_reserve_window_node *my_rsv,
993 struct super_block * sb,
994 ext3_fsblk_t start_block,
995 ext3_fsblk_t last_block)
996{
997 struct rb_node *next;
998 struct ext3_reserve_window_node *rsv, *prev;
999 ext3_fsblk_t cur;
1000 int size = my_rsv->rsv_goal_size;
1001
1002 /* TODO: make the start of the reservation window byte-aligned */
1003 /* cur = *start_block & ~7;*/
1004 cur = start_block;
1005 rsv = search_head;
1006 if (!rsv)
1007 return -1;
1008
1009 while (1) {
1010 if (cur <= rsv->rsv_end)
1011 cur = rsv->rsv_end + 1;
1012
1013 /* TODO?
1014 * in the case where we could not find a reservable space
1015 * of the expected size, during the re-search we could
1016 * remember the largest reservable space we have seen
1017 * and return that one.
1018 *
1019 * For now it will fail if we could not find a reservable
1020 * space of the expected size (or more)...
1021 */
1022 if (cur > last_block)
1023 return -1; /* fail */
1024
1025 prev = rsv;
1026 next = rb_next(&rsv->rsv_node);
1027 rsv = rb_entry(next,struct ext3_reserve_window_node,rsv_node);
1028
1029 /*
1030 * Reached the last reservation, we can just append to the
1031 * previous one.
1032 */
1033 if (!next)
1034 break;
1035
1036 if (cur + size <= rsv->rsv_start) {
1037 /*
1038 * Found a reservable space big enough. We could
1039 * have a reservation across the group boundary here
1040 */
1041 break;
1042 }
1043 }
1044 /*
1045 * We come here either:
1046 * when we reach the end of the whole list and there is empty
1047 * reservable space after the last entry in the list, in which
1048 * case we append to the end of the list;
1049 *
1050 * or when we found a reservable space in the middle of the list,
1051 * in which case we return the reservation window that we could
1052 * append to, and succeed.
1053 */
1054
1055 if ((prev != my_rsv) && (!rsv_is_empty(&my_rsv->rsv_window)))
1056 rsv_window_remove(sb, my_rsv);
1057
1058 /*
1059 * Let's book the whole available window for now. We will check the
1060 * disk bitmap later and then, if there are free blocks, we adjust
1061 * the window size if it's larger than requested.
1062 * Otherwise, we will remove this node from the tree the next time
1063 * find_next_reservable_window() is called.
1064 */
1065 my_rsv->rsv_start = cur;
1066 my_rsv->rsv_end = cur + size - 1;
1067 my_rsv->rsv_alloc_hit = 0;
1068
1069 if (prev != my_rsv)
1070 ext3_rsv_window_add(sb, my_rsv);
1071
1072 return 0;
1073}
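A concrete walk through the loop above (the window positions are made up purely for illustration):

	/* Existing windows in the tree: [100,149] and [300,349].
	 * start_block == 120, my_rsv->rsv_goal_size == 50.
	 *
	 * search_head is the window covering 120, i.e. [100,149], so the
	 * first iteration advances cur to 150.  The next window starts at
	 * 300 and 150 + 50 <= 300, so we break out and book the new window
	 * as [150,199] before inserting it with ext3_rsv_window_add().
	 */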
1074
1075/**
1076 * alloc_new_reservation()--allocate a new reservation window
1077 *
1078 * To make a new reservation, we search part of the filesystem
1079 * reservation list (the part that falls inside the group). We try to
1080 * allocate a new reservation window near the allocation goal,
1081 * or the beginning of the group, if there is no goal.
1082 *
1083 * We first find a reservable space after the goal, then from
1084 * there, we check the bitmap for the first free block after
1085 * it. If there is no free block until the end of group, then the
1086 * whole group is full, we failed. Otherwise, check if the free
1087 * block is inside the expected reservable space, if so, we
1088 * succeed.
1089 * If the first free block is outside the reservable space, we
1090 * start from that free block, search for the next available
1091 * space, and go on.
1092 *
1093 * On success, a new reservation will be found and inserted into the list.
1094 * It contains at least one free block, and it does not overlap with other
1095 * reservation windows.
1096 *
1097 * failed: we failed to find a reservation window in this group
1098 *
1099 * @my_rsv: the reservation window
1100 *
1101 * @grp_goal: The goal (group-relative). It is where the search for a
1102 * free reservable space should start from.
1103 * If we have a grp_goal (grp_goal > 0), we start from there;
1104 * with no grp_goal (grp_goal = -1), we start from the first block
1105 * of the group.
1106 *
1107 * @sb: the super block
1108 * @group: the group we are trying to allocate in
1109 * @bitmap_bh: the block group block bitmap
1110 *
1111 */
1112static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv,
1113 ext3_grpblk_t grp_goal, struct super_block *sb,
1114 unsigned int group, struct buffer_head *bitmap_bh)
1115{
1116 struct ext3_reserve_window_node *search_head;
1117 ext3_fsblk_t group_first_block, group_end_block, start_block;
1118 ext3_grpblk_t first_free_block;
1119 struct rb_root *fs_rsv_root = &EXT3_SB(sb)->s_rsv_window_root;
1120 unsigned long size;
1121 int ret;
1122 spinlock_t *rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock;
1123
1124 group_first_block = ext3_group_first_block_no(sb, group);
1125 group_end_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1);
1126
1127 if (grp_goal < 0)
1128 start_block = group_first_block;
1129 else
1130 start_block = grp_goal + group_first_block;
1131
1132 trace_ext3_alloc_new_reservation(sb, start_block);
1133 size = my_rsv->rsv_goal_size;
1134
1135 if (!rsv_is_empty(&my_rsv->rsv_window)) {
1136 /*
1137 * if the old reservation crosses the group boundary
1138 * and the goal is inside the old reservation window,
1139 * we will come here when we just failed to allocate from
1140 * the first part of the window. We still have another part
1141 * that belongs to the next group. In this case, there is no
1142 * point in discarding our window and trying to allocate a new
1143 * one in this group (which will fail); we should
1144 * keep the reservation window and simply move on.
1145 *
1146 * Maybe we could shift the start block of the reservation
1147 * window to the first block of the next group.
1148 */
1149
1150 if ((my_rsv->rsv_start <= group_end_block) &&
1151 (my_rsv->rsv_end > group_end_block) &&
1152 (start_block >= my_rsv->rsv_start))
1153 return -1;
1154
1155 if ((my_rsv->rsv_alloc_hit >
1156 (my_rsv->rsv_end - my_rsv->rsv_start + 1) / 2)) {
1157 /*
1158 * if the previous allocation hit ratio is
1159 * greater than 1/2, then we double the size of
1160 * the reservation window the next time,
1161 * otherwise we keep the same size window
1162 */
1163 size = size * 2;
1164 if (size > EXT3_MAX_RESERVE_BLOCKS)
1165 size = EXT3_MAX_RESERVE_BLOCKS;
1166 my_rsv->rsv_goal_size= size;
1167 }
1168 }
1169
1170 spin_lock(rsv_lock);
1171 /*
1172 * shift the search start to the window near the goal block
1173 */
1174 search_head = search_reserve_window(fs_rsv_root, start_block);
1175
1176 /*
1177 * find_next_reservable_window() simply finds a reservable window
1178 * inside the given range(start_block, group_end_block).
1179 *
1180 * To make sure the reservation window has a free bit inside it, we
1181 * need to check the bitmap after we found a reservable window.
1182 */
1183retry:
1184 ret = find_next_reservable_window(search_head, my_rsv, sb,
1185 start_block, group_end_block);
1186
1187 if (ret == -1) {
1188 if (!rsv_is_empty(&my_rsv->rsv_window))
1189 rsv_window_remove(sb, my_rsv);
1190 spin_unlock(rsv_lock);
1191 return -1;
1192 }
1193
1194 /*
1195 * On success, find_next_reservable_window() returns the
1196 * reservation window where there is a reservable space after it.
1197 * Before we reserve this reservable space, we need
1198 * to make sure there is at least a free block inside this region.
1199 *
1200 * Search for the first free bit in the block bitmap and in the copy of
1201 * the last committed bitmap alternately, until we find an allocatable
1202 * block. The search starts from the start block of the reservable space
1203 * we just found.
1204 */
1205 spin_unlock(rsv_lock);
1206 first_free_block = bitmap_search_next_usable_block(
1207 my_rsv->rsv_start - group_first_block,
1208 bitmap_bh, group_end_block - group_first_block + 1);
1209
1210 if (first_free_block < 0) {
1211 /*
1212 * no free block left on the bitmap, no point
1213 * to reserve the space. return failed.
1214 */
1215 spin_lock(rsv_lock);
1216 if (!rsv_is_empty(&my_rsv->rsv_window))
1217 rsv_window_remove(sb, my_rsv);
1218 spin_unlock(rsv_lock);
1219 return -1; /* failed */
1220 }
1221
1222 start_block = first_free_block + group_first_block;
1223 /*
1224 * check if the first free block is within the
1225 * free space we just reserved
1226 */
1227 if (start_block >= my_rsv->rsv_start &&
1228 start_block <= my_rsv->rsv_end) {
1229 trace_ext3_reserved(sb, start_block, my_rsv);
1230 return 0; /* success */
1231 }
1232 /*
1233 * if the first free bit we found is outside the reservable space,
1234 * continue the search for the next reservable space,
1235 * starting from where the free block is;
1236 * we also shift the list head to where we stopped last time
1237 */
1238 search_head = my_rsv;
1239 spin_lock(rsv_lock);
1240 goto retry;
1241}
1242
1243/**
1244 * try_to_extend_reservation()
1245 * @my_rsv: given reservation window
1246 * @sb: super block
1247 * @size: the delta to extend
1248 *
1249 * Attempt to expand the reservation window enough to hold the
1250 * required number of free blocks.
1251 *
1252 * Since ext3_try_to_allocate() will always allocate blocks within
1253 * the reservation window range, if the window size is too small,
1254 * a multiple-block allocation has to stop at the end of the reservation
1255 * window. To make this more efficient, given the total number of
1256 * blocks needed and the current size of the window, we try to
1257 * expand the reservation window size if necessary on a best-effort
1258 * basis before ext3_new_blocks() tries to allocate blocks.
1259 */
1260static void try_to_extend_reservation(struct ext3_reserve_window_node *my_rsv,
1261 struct super_block *sb, int size)
1262{
1263 struct ext3_reserve_window_node *next_rsv;
1264 struct rb_node *next;
1265 spinlock_t *rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock;
1266
1267 if (!spin_trylock(rsv_lock))
1268 return;
1269
1270 next = rb_next(&my_rsv->rsv_node);
1271
1272 if (!next)
1273 my_rsv->rsv_end += size;
1274 else {
1275 next_rsv = rb_entry(next, struct ext3_reserve_window_node, rsv_node);
1276
1277 if ((next_rsv->rsv_start - my_rsv->rsv_end - 1) >= size)
1278 my_rsv->rsv_end += size;
1279 else
1280 my_rsv->rsv_end = next_rsv->rsv_start - 1;
1281 }
1282 spin_unlock(rsv_lock);
1283}
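For example (block numbers assumed only for illustration), when the caller asks to extend by size blocks:

	/* my_rsv ends at block 199 and the next window starts at 240,
	 * so the usable gap is 240 - 199 - 1 == 40 blocks.
	 *
	 *   size == 30: 40 >= 30, so rsv_end becomes 199 + 30 == 229
	 *   size == 50: 40 <  50, so rsv_end is clamped to 240 - 1 == 239
	 */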
1284
1285/**
1286 * ext3_try_to_allocate_with_rsv()
1287 * @sb: superblock
1288 * @handle: handle to this transaction
1289 * @group: given allocation block group
1290 * @bitmap_bh: bufferhead holds the block bitmap
1291 * @grp_goal: given target block within the group
1292 * @my_rsv: reservation window
1293 * @count: target number of blocks to allocate
1294 * @errp: pointer to store the error code
1295 *
1296 * This is the main function used to allocate a new block and its reservation
1297 * window.
1298 *
1299 * Each time a new block allocation is needed, first try to allocate from
1300 * the file's own reservation. If it does not have a reservation window,
1301 * then, instead of first looking for a free bit in the bitmap and then
1302 * checking the reservation list to see if it falls inside somebody else's
1303 * reservation window, we try to allocate a reservation window for it
1304 * starting from the goal, and then do the block allocation within it.
1305 *
1306 * This will avoid keeping on searching the reservation list again and
1307 * again when somebody is looking for a free block (without
1308 * reservation), and there are lots of free blocks, but they are all
1309 * being reserved.
1310 *
1311 * We use a red-black tree for the per-filesystem reservation list.
1312 *
1313 */
1314static ext3_grpblk_t
1315ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
1316 unsigned int group, struct buffer_head *bitmap_bh,
1317 ext3_grpblk_t grp_goal,
1318 struct ext3_reserve_window_node * my_rsv,
1319 unsigned long *count, int *errp)
1320{
1321 ext3_fsblk_t group_first_block, group_last_block;
1322 ext3_grpblk_t ret = 0;
1323 int fatal;
1324 unsigned long num = *count;
1325
1326 *errp = 0;
1327
1328 /*
1329 * Make sure we use undo access for the bitmap, because it is critical
1330 * that we do the frozen_data COW on bitmap buffers in all cases even
1331 * if the buffer is in BJ_Forget state in the committing transaction.
1332 */
1333 BUFFER_TRACE(bitmap_bh, "get undo access for new block");
1334 fatal = ext3_journal_get_undo_access(handle, bitmap_bh);
1335 if (fatal) {
1336 *errp = fatal;
1337 return -1;
1338 }
1339
1340 /*
1341 * we don't deal with reservations when
1342 * the filesystem is mounted without reservations,
1343 * or the file is not a regular file,
1344 * or the last attempt to allocate a block with reservation turned on failed
1345 */
1346 if (my_rsv == NULL ) {
1347 ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh,
1348 grp_goal, count, NULL);
1349 goto out;
1350 }
1351 /*
1352 * grp_goal is a group relative block number (if there is a goal)
1353 * 0 <= grp_goal < EXT3_BLOCKS_PER_GROUP(sb)
1354 * first block is a filesystem wide block number
1355 * first block is the block number of the first block in this group
1356 */
1357 group_first_block = ext3_group_first_block_no(sb, group);
1358 group_last_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1);
1359
1360 /*
1361 * Basically we will allocate a new block from inode's reservation
1362 * window.
1363 *
1364 * We need to allocate a new reservation window, if:
1365 * a) inode does not have a reservation window; or
1366 * b) last attempt to allocate a block from existing reservation
1367 * failed; or
1368 * c) we come here with a goal and with a reservation window
1369 *
1370 * We do not need to allocate a new reservation window if we come here
1371 * at the beginning with a goal and the goal is inside the window, or
1372 * we don't have a goal but already have a reservation window.
1373 * In either case we can allocate from the reservation window directly.
1374 */
1375 while (1) {
1376 if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) ||
1377 !goal_in_my_reservation(&my_rsv->rsv_window,
1378 grp_goal, group, sb)) {
1379 if (my_rsv->rsv_goal_size < *count)
1380 my_rsv->rsv_goal_size = *count;
1381 ret = alloc_new_reservation(my_rsv, grp_goal, sb,
1382 group, bitmap_bh);
1383 if (ret < 0)
1384 break; /* failed */
1385
1386 if (!goal_in_my_reservation(&my_rsv->rsv_window,
1387 grp_goal, group, sb))
1388 grp_goal = -1;
1389 } else if (grp_goal >= 0) {
1390 int curr = my_rsv->rsv_end -
1391 (grp_goal + group_first_block) + 1;
1392
1393 if (curr < *count)
1394 try_to_extend_reservation(my_rsv, sb,
1395 *count - curr);
1396 }
1397
1398 if ((my_rsv->rsv_start > group_last_block) ||
1399 (my_rsv->rsv_end < group_first_block)) {
1400 rsv_window_dump(&EXT3_SB(sb)->s_rsv_window_root, 1);
1401 BUG();
1402 }
1403 ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh,
1404 grp_goal, &num, &my_rsv->rsv_window);
1405 if (ret >= 0) {
1406 my_rsv->rsv_alloc_hit += num;
1407 *count = num;
1408 break; /* succeed */
1409 }
1410 num = *count;
1411 }
1412out:
1413 if (ret >= 0) {
1414 BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for "
1415 "bitmap block");
1416 fatal = ext3_journal_dirty_metadata(handle, bitmap_bh);
1417 if (fatal) {
1418 *errp = fatal;
1419 return -1;
1420 }
1421 return ret;
1422 }
1423
1424 BUFFER_TRACE(bitmap_bh, "journal_release_buffer");
1425 ext3_journal_release_buffer(handle, bitmap_bh);
1426 return ret;
1427}
1428
1429/**
1430 * ext3_has_free_blocks()
1431 * @sbi: in-core super block structure.
1432 *
1433 * Check if filesystem has at least 1 free block available for allocation.
1434 */
1435static int ext3_has_free_blocks(struct ext3_sb_info *sbi, int use_reservation)
1436{
1437 ext3_fsblk_t free_blocks, root_blocks;
1438
1439 free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
1440 root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count);
1441 if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
1442 !use_reservation && !uid_eq(sbi->s_resuid, current_fsuid()) &&
1443 (gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) ||
1444 !in_group_p (sbi->s_resgid))) {
1445 return 0;
1446 }
1447 return 1;
1448}
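As an illustration of the reserved-blocks policy above (the counts are made up):

	/* Suppose s_freeblocks_counter reads 10 and s_r_blocks_count is 12.
	 * Then free_blocks < root_blocks + 1, so an ordinary allocation
	 * (no CAP_SYS_RESOURCE, not resuid/resgid, use_reservation == 0)
	 * gets 0 and the caller sees -ENOSPC, while a privileged or
	 * reserved-id caller still gets 1 and may dip into the reserve.
	 */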
1449
1450/**
1451 * ext3_should_retry_alloc()
1452 * @sb: super block
1453 * @retries: number of attempts that have been made
1454 *
1455 * ext3_should_retry_alloc() is called when ENOSPC is returned, and if
1456 * it is profitable to retry the operation, this function will wait
1457 * for the current or committing transaction to complete, and then
1458 * return TRUE.
1459 *
1460 * If the total number of retries exceeds three, return FALSE.
1461 */
1462int ext3_should_retry_alloc(struct super_block *sb, int *retries)
1463{
1464 if (!ext3_has_free_blocks(EXT3_SB(sb), 0) || (*retries)++ > 3)
1465 return 0;
1466
1467 jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
1468
1469 return journal_force_commit_nested(EXT3_SB(sb)->s_journal);
1470}
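The intended caller pattern is the ENOSPC retry loop used, for instance, by ext3_set_acl() in fs/ext3/acl.c earlier in this change; a minimal sketch, where do_work() and credits stand in for the caller's real operation and journal credits:

	int err, retries = 0;

retry:
	handle = ext3_journal_start(inode, credits);
	if (IS_ERR(handle))
		return PTR_ERR(handle);
	err = do_work(handle, inode);		/* hypothetical helper */
	ext3_journal_stop(handle);
	if (err == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
		goto retry;
	return err;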
1471
1472/**
1473 * ext3_new_blocks() -- core block(s) allocation function
1474 * @handle: handle to this transaction
1475 * @inode: file inode
1476 * @goal: given target block(filesystem wide)
1477 * @count: target number of blocks to allocate
1478 * @errp: error code
1479 *
1480 * ext3_new_blocks uses a goal block to assist allocation. It tries to
1481 * allocate block(s) from the block group that contains the goal block first. If that
1482 * fails, it will try to allocate block(s) from other block groups without
1483 * any specific goal block.
1484 *
1485 */
1486ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode,
1487 ext3_fsblk_t goal, unsigned long *count, int *errp)
1488{
1489 struct buffer_head *bitmap_bh = NULL;
1490 struct buffer_head *gdp_bh;
1491 int group_no;
1492 int goal_group;
1493 ext3_grpblk_t grp_target_blk; /* blockgroup relative goal block */
1494 ext3_grpblk_t grp_alloc_blk; /* blockgroup-relative allocated block*/
1495	ext3_fsblk_t ret_block;		/* filesystem-wide allocated block */
1496 int bgi; /* blockgroup iteration index */
1497 int fatal = 0, err;
1498 int performed_allocation = 0;
1499 ext3_grpblk_t free_blocks; /* number of free blocks in a group */
1500 struct super_block *sb;
1501 struct ext3_group_desc *gdp;
1502 struct ext3_super_block *es;
1503 struct ext3_sb_info *sbi;
1504 struct ext3_reserve_window_node *my_rsv = NULL;
1505 struct ext3_block_alloc_info *block_i;
1506 unsigned short windowsz = 0;
1507#ifdef EXT3FS_DEBUG
1508 static int goal_hits, goal_attempts;
1509#endif
1510 unsigned long ngroups;
1511 unsigned long num = *count;
1512
1513 *errp = -ENOSPC;
1514 sb = inode->i_sb;
1515
1516 /*
1517 * Check quota for allocation of this block.
1518 */
1519 err = dquot_alloc_block(inode, num);
1520 if (err) {
1521 *errp = err;
1522 return 0;
1523 }
1524
1525 trace_ext3_request_blocks(inode, goal, num);
1526
1527 sbi = EXT3_SB(sb);
1528 es = sbi->s_es;
1529 ext3_debug("goal=%lu.\n", goal);
1530 /*
1531 * Allocate a block from reservation only when
1532 * the filesystem is mounted with reservation (default, -o reservation), and
1533 * it's a regular file, and
1534 * the desired window size is greater than 0 (One could use ioctl
1535 * command EXT3_IOC_SETRSVSZ to set the window size to 0 to turn off
1536 * reservation on that particular file)
1537 */
1538 block_i = EXT3_I(inode)->i_block_alloc_info;
1539 if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0))
1540 my_rsv = &block_i->rsv_window_node;
1541
1542 if (!ext3_has_free_blocks(sbi, IS_NOQUOTA(inode))) {
1543 *errp = -ENOSPC;
1544 goto out;
1545 }
1546
1547 /*
1548 * First, test whether the goal block is free.
1549 */
1550 if (goal < le32_to_cpu(es->s_first_data_block) ||
1551 goal >= le32_to_cpu(es->s_blocks_count))
1552 goal = le32_to_cpu(es->s_first_data_block);
1553 group_no = (goal - le32_to_cpu(es->s_first_data_block)) /
1554 EXT3_BLOCKS_PER_GROUP(sb);
1555 goal_group = group_no;
1556retry_alloc:
1557 gdp = ext3_get_group_desc(sb, group_no, &gdp_bh);
1558 if (!gdp)
1559 goto io_error;
1560
1561 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
1562 /*
1563 * if there are not enough free blocks to make a new reservation,
1564 * turn off reservation for this allocation
1565 */
1566 if (my_rsv && (free_blocks < windowsz)
1567 && (free_blocks > 0)
1568 && (rsv_is_empty(&my_rsv->rsv_window)))
1569 my_rsv = NULL;
1570
1571 if (free_blocks > 0) {
1572 grp_target_blk = ((goal - le32_to_cpu(es->s_first_data_block)) %
1573 EXT3_BLOCKS_PER_GROUP(sb));
1574 bitmap_bh = read_block_bitmap(sb, group_no);
1575 if (!bitmap_bh)
1576 goto io_error;
1577 grp_alloc_blk = ext3_try_to_allocate_with_rsv(sb, handle,
1578 group_no, bitmap_bh, grp_target_blk,
1579 my_rsv, &num, &fatal);
1580 if (fatal)
1581 goto out;
1582 if (grp_alloc_blk >= 0)
1583 goto allocated;
1584 }
1585
1586 ngroups = EXT3_SB(sb)->s_groups_count;
1587 smp_rmb();
1588
1589 /*
1590 * Now search the rest of the groups. We assume that
1591 * group_no and gdp correctly point to the last group visited.
1592 */
1593 for (bgi = 0; bgi < ngroups; bgi++) {
1594 group_no++;
1595 if (group_no >= ngroups)
1596 group_no = 0;
1597 gdp = ext3_get_group_desc(sb, group_no, &gdp_bh);
1598 if (!gdp)
1599 goto io_error;
1600 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
1601 /*
1602 * skip this group (and avoid loading bitmap) if there
1603 * are no free blocks
1604 */
1605 if (!free_blocks)
1606 continue;
1607 /*
1608 * skip this group if the number of
1609 * free blocks is less than half of the reservation
1610 * window size.
1611 */
1612 if (my_rsv && (free_blocks <= (windowsz/2)))
1613 continue;
1614
1615 brelse(bitmap_bh);
1616 bitmap_bh = read_block_bitmap(sb, group_no);
1617 if (!bitmap_bh)
1618 goto io_error;
1619 /*
1620 * try to allocate block(s) from this group, without a goal(-1).
1621 */
1622 grp_alloc_blk = ext3_try_to_allocate_with_rsv(sb, handle,
1623 group_no, bitmap_bh, -1, my_rsv,
1624 &num, &fatal);
1625 if (fatal)
1626 goto out;
1627 if (grp_alloc_blk >= 0)
1628 goto allocated;
1629 }
1630 /*
1631 * We may end up with a bogus earlier ENOSPC error because the
1632 * filesystem is "full" of reservations, while
1633 * there may indeed be free blocks available on disk.
1634 * In this case, we just forget about the reservations and
1635 * do the block allocation as if without reservations.
1636 */
1637 if (my_rsv) {
1638 my_rsv = NULL;
1639 windowsz = 0;
1640 group_no = goal_group;
1641 goto retry_alloc;
1642 }
1643 /* No space left on the device */
1644 *errp = -ENOSPC;
1645 goto out;
1646
1647allocated:
1648
1649 ext3_debug("using block group %d(%d)\n",
1650 group_no, gdp->bg_free_blocks_count);
1651
1652 BUFFER_TRACE(gdp_bh, "get_write_access");
1653 fatal = ext3_journal_get_write_access(handle, gdp_bh);
1654 if (fatal)
1655 goto out;
1656
1657 ret_block = grp_alloc_blk + ext3_group_first_block_no(sb, group_no);
1658
1659 if (in_range(le32_to_cpu(gdp->bg_block_bitmap), ret_block, num) ||
1660 in_range(le32_to_cpu(gdp->bg_inode_bitmap), ret_block, num) ||
1661 in_range(ret_block, le32_to_cpu(gdp->bg_inode_table),
1662 EXT3_SB(sb)->s_itb_per_group) ||
1663 in_range(ret_block + num - 1, le32_to_cpu(gdp->bg_inode_table),
1664 EXT3_SB(sb)->s_itb_per_group)) {
1665 ext3_error(sb, "ext3_new_block",
1666 "Allocating block in system zone - "
1667 "blocks from "E3FSBLK", length %lu",
1668 ret_block, num);
1669 /*
1670 * claim_block() marked the blocks we allocated as in use. So we
1671 * may want to selectively mark some of the blocks as free.
1672 */
1673 goto retry_alloc;
1674 }
1675
1676 performed_allocation = 1;
1677
1678#ifdef CONFIG_JBD_DEBUG
1679 {
1680 struct buffer_head *debug_bh;
1681
1682 /* Record bitmap buffer state in the newly allocated block */
1683 debug_bh = sb_find_get_block(sb, ret_block);
1684 if (debug_bh) {
1685 BUFFER_TRACE(debug_bh, "state when allocated");
1686 BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap state");
1687 brelse(debug_bh);
1688 }
1689 }
1690 jbd_lock_bh_state(bitmap_bh);
1691 spin_lock(sb_bgl_lock(sbi, group_no));
1692 if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data) {
1693 int i;
1694
1695 for (i = 0; i < num; i++) {
1696 if (ext3_test_bit(grp_alloc_blk+i,
1697 bh2jh(bitmap_bh)->b_committed_data)) {
1698 printk("%s: block was unexpectedly set in "
1699 "b_committed_data\n", __func__);
1700 }
1701 }
1702 }
1703 ext3_debug("found bit %d\n", grp_alloc_blk);
1704 spin_unlock(sb_bgl_lock(sbi, group_no));
1705 jbd_unlock_bh_state(bitmap_bh);
1706#endif
1707
1708 if (ret_block + num - 1 >= le32_to_cpu(es->s_blocks_count)) {
1709 ext3_error(sb, "ext3_new_block",
1710 "block("E3FSBLK") >= blocks count(%d) - "
1711 "block_group = %d, es == %p ", ret_block,
1712 le32_to_cpu(es->s_blocks_count), group_no, es);
1713 goto out;
1714 }
1715
1716 /*
1717 * It is up to the caller to add the new buffer to a journal
1718 * list of some description. We don't know in advance whether
1719 * the caller wants to use it as metadata or data.
1720 */
1721 ext3_debug("allocating block %lu. Goal hits %d of %d.\n",
1722 ret_block, goal_hits, goal_attempts);
1723
1724 spin_lock(sb_bgl_lock(sbi, group_no));
1725 le16_add_cpu(&gdp->bg_free_blocks_count, -num);
1726 spin_unlock(sb_bgl_lock(sbi, group_no));
1727 percpu_counter_sub(&sbi->s_freeblocks_counter, num);
1728
1729 BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
1730 fatal = ext3_journal_dirty_metadata(handle, gdp_bh);
1731 if (fatal)
1732 goto out;
1733
1734 *errp = 0;
1735 brelse(bitmap_bh);
1736
1737 if (num < *count) {
1738 dquot_free_block(inode, *count-num);
1739 *count = num;
1740 }
1741
1742 trace_ext3_allocate_blocks(inode, goal, num,
1743 (unsigned long long)ret_block);
1744
1745 return ret_block;
1746
1747io_error:
1748 *errp = -EIO;
1749out:
1750 if (fatal) {
1751 *errp = fatal;
1752 ext3_std_error(sb, fatal);
1753 }
1754 /*
1755 * Undo the block allocation
1756 */
1757 if (!performed_allocation)
1758 dquot_free_block(inode, *count);
1759 brelse(bitmap_bh);
1760 return 0;
1761}
1762
1763ext3_fsblk_t ext3_new_block(handle_t *handle, struct inode *inode,
1764 ext3_fsblk_t goal, int *errp)
1765{
1766 unsigned long count = 1;
1767
1768 return ext3_new_blocks(handle, inode, goal, &count, errp);
1769}
1770
1771/**
1772 * ext3_count_free_blocks() -- count filesystem free blocks
1773 * @sb: superblock
1774 *
1775 * Adds up the number of free blocks from each block group.
1776 */
1777ext3_fsblk_t ext3_count_free_blocks(struct super_block *sb)
1778{
1779 ext3_fsblk_t desc_count;
1780 struct ext3_group_desc *gdp;
1781 int i;
1782 unsigned long ngroups = EXT3_SB(sb)->s_groups_count;
1783#ifdef EXT3FS_DEBUG
1784 struct ext3_super_block *es;
1785 ext3_fsblk_t bitmap_count;
1786 unsigned long x;
1787 struct buffer_head *bitmap_bh = NULL;
1788
1789 es = EXT3_SB(sb)->s_es;
1790 desc_count = 0;
1791 bitmap_count = 0;
1792 gdp = NULL;
1793
1794 smp_rmb();
1795 for (i = 0; i < ngroups; i++) {
1796 gdp = ext3_get_group_desc(sb, i, NULL);
1797 if (!gdp)
1798 continue;
1799 desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
1800 brelse(bitmap_bh);
1801 bitmap_bh = read_block_bitmap(sb, i);
1802 if (bitmap_bh == NULL)
1803 continue;
1804
1805 x = ext3_count_free(bitmap_bh, sb->s_blocksize);
1806 printk("group %d: stored = %d, counted = %lu\n",
1807 i, le16_to_cpu(gdp->bg_free_blocks_count), x);
1808 bitmap_count += x;
1809 }
1810 brelse(bitmap_bh);
1811 printk("ext3_count_free_blocks: stored = "E3FSBLK
1812 ", computed = "E3FSBLK", "E3FSBLK"\n",
1813 (ext3_fsblk_t)le32_to_cpu(es->s_free_blocks_count),
1814 desc_count, bitmap_count);
1815 return bitmap_count;
1816#else
1817 desc_count = 0;
1818 smp_rmb();
1819 for (i = 0; i < ngroups; i++) {
1820 gdp = ext3_get_group_desc(sb, i, NULL);
1821 if (!gdp)
1822 continue;
1823 desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
1824 }
1825
1826 return desc_count;
1827#endif
1828}
1829
1830static inline int test_root(int a, int b)
1831{
1832 int num = b;
1833
1834 while (a > num)
1835 num *= b;
1836 return num == a;
1837}
1838
1839static int ext3_group_sparse(int group)
1840{
1841 if (group <= 1)
1842 return 1;
1843 if (!(group & 1))
1844 return 0;
1845 return (test_root(group, 7) || test_root(group, 5) ||
1846 test_root(group, 3));
1847}
1848
1849/**
1850 * ext3_bg_has_super - number of blocks used by the superblock in group
1851 * @sb: superblock for filesystem
1852 * @group: group number to check
1853 *
1854 * Return the number of blocks used by the superblock (primary or backup)
1855 * in this group. Currently this will be only 0 or 1.
1856 */
1857int ext3_bg_has_super(struct super_block *sb, int group)
1858{
1859 if (EXT3_HAS_RO_COMPAT_FEATURE(sb,
1860 EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER) &&
1861 !ext3_group_sparse(group))
1862 return 0;
1863 return 1;
1864}
1865
1866static unsigned long ext3_bg_num_gdb_meta(struct super_block *sb, int group)
1867{
1868 unsigned long metagroup = group / EXT3_DESC_PER_BLOCK(sb);
1869 unsigned long first = metagroup * EXT3_DESC_PER_BLOCK(sb);
1870 unsigned long last = first + EXT3_DESC_PER_BLOCK(sb) - 1;
1871
1872 if (group == first || group == first + 1 || group == last)
1873 return 1;
1874 return 0;
1875}
1876
1877static unsigned long ext3_bg_num_gdb_nometa(struct super_block *sb, int group)
1878{
1879 return ext3_bg_has_super(sb, group) ? EXT3_SB(sb)->s_gdb_count : 0;
1880}
1881
1882/**
1883 * ext3_bg_num_gdb - number of blocks used by the group table in group
1884 * @sb: superblock for filesystem
1885 * @group: group number to check
1886 *
1887 * Return the number of blocks used by the group descriptor table
1888 * (primary or backup) in this group. In the future there may be a
1889 * different number of descriptor blocks in each group.
1890 */
1891unsigned long ext3_bg_num_gdb(struct super_block *sb, int group)
1892{
1893 unsigned long first_meta_bg =
1894 le32_to_cpu(EXT3_SB(sb)->s_es->s_first_meta_bg);
1895 unsigned long metagroup = group / EXT3_DESC_PER_BLOCK(sb);
1896
1897 if (!EXT3_HAS_INCOMPAT_FEATURE(sb,EXT3_FEATURE_INCOMPAT_META_BG) ||
1898 metagroup < first_meta_bg)
1899 return ext3_bg_num_gdb_nometa(sb,group);
1900
1901 return ext3_bg_num_gdb_meta(sb,group);
1902
1903}
1904
1905/**
1906 * ext3_trim_all_free -- function to trim all free space in alloc. group
1907 * @sb: super block for file system
1908 * @group: allocation group to trim
1909 * @start: first group block to examine
1910 * @max: last group block to examine
1911 * @gdp: allocation group description structure
1912 * @minblocks: minimum extent block count
1913 *
1914 * ext3_trim_all_free walks through the group's block bitmap searching for free
1915 * blocks. When a free block is found, it tries to claim that block and the
1916 * consecutive free blocks to get the biggest free extent possible, until it
1917 * reaches any used block. It then issues a TRIM command on this extent and
1918 * frees the extent in the block bitmap, repeating until the whole group is scanned.
1919 */
1920static ext3_grpblk_t ext3_trim_all_free(struct super_block *sb,
1921 unsigned int group,
1922 ext3_grpblk_t start, ext3_grpblk_t max,
1923 ext3_grpblk_t minblocks)
1924{
1925 handle_t *handle;
1926 ext3_grpblk_t next, free_blocks, bit, freed, count = 0;
1927 ext3_fsblk_t discard_block;
1928 struct ext3_sb_info *sbi;
1929 struct buffer_head *gdp_bh, *bitmap_bh = NULL;
1930 struct ext3_group_desc *gdp;
1931 int err = 0, ret = 0;
1932
1933 /*
1934 * We will update one block bitmap, and one group descriptor
1935 */
1936 handle = ext3_journal_start_sb(sb, 2);
1937 if (IS_ERR(handle))
1938 return PTR_ERR(handle);
1939
1940 bitmap_bh = read_block_bitmap(sb, group);
1941 if (!bitmap_bh) {
1942 err = -EIO;
1943 goto err_out;
1944 }
1945
1946 BUFFER_TRACE(bitmap_bh, "getting undo access");
1947 err = ext3_journal_get_undo_access(handle, bitmap_bh);
1948 if (err)
1949 goto err_out;
1950
1951 gdp = ext3_get_group_desc(sb, group, &gdp_bh);
1952 if (!gdp) {
1953 err = -EIO;
1954 goto err_out;
1955 }
1956
1957 BUFFER_TRACE(gdp_bh, "get_write_access");
1958 err = ext3_journal_get_write_access(handle, gdp_bh);
1959 if (err)
1960 goto err_out;
1961
1962 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
1963 sbi = EXT3_SB(sb);
1964
1965 /* Walk through the whole group */
1966 while (start <= max) {
1967 start = bitmap_search_next_usable_block(start, bitmap_bh, max);
1968 if (start < 0)
1969 break;
1970 next = start;
1971
1972 /*
1973 * Allocate contiguous free extents by setting bits in the
1974 * block bitmap
1975 */
1976 while (next <= max
1977 && claim_block(sb_bgl_lock(sbi, group),
1978 next, bitmap_bh)) {
1979 next++;
1980 }
1981
1982 /* We did not claim any blocks */
1983 if (next == start)
1984 continue;
1985
1986 discard_block = (ext3_fsblk_t)start +
1987 ext3_group_first_block_no(sb, group);
1988
1989 /* Update counters */
1990 spin_lock(sb_bgl_lock(sbi, group));
1991 le16_add_cpu(&gdp->bg_free_blocks_count, start - next);
1992 spin_unlock(sb_bgl_lock(sbi, group));
1993 percpu_counter_sub(&sbi->s_freeblocks_counter, next - start);
1994
1995 free_blocks -= next - start;
1996 /* Do not issue a TRIM on extents smaller than minblocks */
1997 if ((next - start) < minblocks)
1998 goto free_extent;
1999
2000 trace_ext3_discard_blocks(sb, discard_block, next - start);
2001 /* Send the TRIM command down to the device */
2002 err = sb_issue_discard(sb, discard_block, next - start,
2003 GFP_NOFS, 0);
2004 count += (next - start);
2005free_extent:
2006 freed = 0;
2007
2008 /*
2009 * Clear bits in the bitmap
2010 */
2011 for (bit = start; bit < next; bit++) {
2012 BUFFER_TRACE(bitmap_bh, "clear bit");
2013 if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, group),
2014 bit, bitmap_bh->b_data)) {
2015 ext3_error(sb, __func__,
2016 "bit already cleared for block "E3FSBLK,
2017 (unsigned long)bit);
2018 BUFFER_TRACE(bitmap_bh, "bit already cleared");
2019 } else {
2020 freed++;
2021 }
2022 }
2023
2024		/* Update counters */
2025 spin_lock(sb_bgl_lock(sbi, group));
2026 le16_add_cpu(&gdp->bg_free_blocks_count, freed);
2027 spin_unlock(sb_bgl_lock(sbi, group));
2028 percpu_counter_add(&sbi->s_freeblocks_counter, freed);
2029
2030 start = next;
2031 if (err < 0) {
2032 if (err != -EOPNOTSUPP)
2033 ext3_warning(sb, __func__, "Discard command "
2034 "returned error %d\n", err);
2035 break;
2036 }
2037
2038 if (fatal_signal_pending(current)) {
2039 err = -ERESTARTSYS;
2040 break;
2041 }
2042
2043 cond_resched();
2044
2045 /* No more suitable extents */
2046 if (free_blocks < minblocks)
2047 break;
2048 }
2049
2050 /* We dirtied the bitmap block */
2051 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
2052 ret = ext3_journal_dirty_metadata(handle, bitmap_bh);
2053 if (!err)
2054 err = ret;
2055
2056 /* And the group descriptor block */
2057 BUFFER_TRACE(gdp_bh, "dirtied group descriptor block");
2058 ret = ext3_journal_dirty_metadata(handle, gdp_bh);
2059 if (!err)
2060 err = ret;
2061
2062 ext3_debug("trimmed %d blocks in the group %d\n",
2063 count, group);
2064
2065err_out:
2066 if (err)
2067 count = err;
2068 ext3_journal_stop(handle);
2069 brelse(bitmap_bh);
2070
2071 return count;
2072}
2073
2074/**
2075 * ext3_trim_fs() -- trim ioctl handle function
2076 * @sb: superblock for filesystem
2077 * @start: First Byte to trim
2078 * @len: number of Bytes to trim from start
2079 * @minlen: minimum extent length in Bytes
2080 *
2081 * ext3_trim_fs goes through all allocation groups containing Bytes from
2082 * start to start+len. For each such group, the ext3_trim_all_free function
2083 * is invoked to trim all free space.
2084 */
2085int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range)
2086{
2087 ext3_grpblk_t last_block, first_block;
2088 unsigned long group, first_group, last_group;
2089 struct ext3_group_desc *gdp;
2090 struct ext3_super_block *es = EXT3_SB(sb)->s_es;
2091 uint64_t start, minlen, end, trimmed = 0;
2092 ext3_fsblk_t first_data_blk =
2093 le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block);
2094 ext3_fsblk_t max_blks = le32_to_cpu(es->s_blocks_count);
2095 int ret = 0;
2096
2097 start = range->start >> sb->s_blocksize_bits;
2098 end = start + (range->len >> sb->s_blocksize_bits) - 1;
2099 minlen = range->minlen >> sb->s_blocksize_bits;
2100
2101 if (minlen > EXT3_BLOCKS_PER_GROUP(sb) ||
2102 start >= max_blks ||
2103 range->len < sb->s_blocksize)
2104 return -EINVAL;
2105 if (end >= max_blks)
2106 end = max_blks - 1;
2107 if (end <= first_data_blk)
2108 goto out;
2109 if (start < first_data_blk)
2110 start = first_data_blk;
2111
2112 smp_rmb();
2113
2114 /* Determine first and last group to examine based on start and len */
2115 ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) start,
2116 &first_group, &first_block);
2117 ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) end,
2118 &last_group, &last_block);
2119
2120 /* end now represents the last block to discard in this group */
2121 end = EXT3_BLOCKS_PER_GROUP(sb) - 1;
2122
2123 for (group = first_group; group <= last_group; group++) {
2124 gdp = ext3_get_group_desc(sb, group, NULL);
2125 if (!gdp)
2126 break;
2127
2128 /*
2129 * For all the groups except the last one, last block will
2130 * always be EXT3_BLOCKS_PER_GROUP(sb)-1, so we only need to
2131		 * change it for the last group; note that last_block is
2132 * already computed earlier by ext3_get_group_no_and_offset()
2133 */
2134 if (group == last_group)
2135 end = last_block;
2136
2137 if (le16_to_cpu(gdp->bg_free_blocks_count) >= minlen) {
2138 ret = ext3_trim_all_free(sb, group, first_block,
2139 end, minlen);
2140 if (ret < 0)
2141 break;
2142 trimmed += ret;
2143 }
2144
2145 /*
2146 * For every group except the first one, we are sure
2147 * that the first block to discard will be block #0.
2148 */
2149 first_block = 0;
2150 }
2151
2152 if (ret > 0)
2153 ret = 0;
2154
2155out:
2156 range->len = trimmed * sb->s_blocksize;
2157 return ret;
2158}
diff --git a/fs/ext3/bitmap.c b/fs/ext3/bitmap.c
deleted file mode 100644
index ef9c643e8e9d..000000000000
--- a/fs/ext3/bitmap.c
+++ /dev/null
@@ -1,20 +0,0 @@
1/*
2 * linux/fs/ext3/bitmap.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 */
9
10#include "ext3.h"
11
12#ifdef EXT3FS_DEBUG
13
14unsigned long ext3_count_free (struct buffer_head * map, unsigned int numchars)
15{
16 return numchars * BITS_PER_BYTE - memweight(map->b_data, numchars);
17}
18
19#endif /* EXT3FS_DEBUG */
20
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
deleted file mode 100644
index 17742eed2c16..000000000000
--- a/fs/ext3/dir.c
+++ /dev/null
@@ -1,537 +0,0 @@
1/*
2 * linux/fs/ext3/dir.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * from
10 *
11 * linux/fs/minix/dir.c
12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds
14 *
15 * ext3 directory handling functions
16 *
17 * Big-endian to little-endian byte-swapping/bitmaps by
18 * David S. Miller (davem@caip.rutgers.edu), 1995
19 *
20 * Hash Tree Directory indexing (c) 2001 Daniel Phillips
21 *
22 */
23
24#include <linux/compat.h>
25#include "ext3.h"
26
27static unsigned char ext3_filetype_table[] = {
28 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
29};
30
31static int ext3_dx_readdir(struct file *, struct dir_context *);
32
33static unsigned char get_dtype(struct super_block *sb, int filetype)
34{
35 if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE) ||
36 (filetype >= EXT3_FT_MAX))
37 return DT_UNKNOWN;
38
39 return (ext3_filetype_table[filetype]);
40}
41
42/**
43 * Check if the given dir-inode refers to an htree-indexed directory
44 * (or a directory which could potentially get converted to use htree
45 * indexing).
46 *
47 * Return 1 if it is a dx dir, 0 if not
48 */
49static int is_dx_dir(struct inode *inode)
50{
51 struct super_block *sb = inode->i_sb;
52
53 if (EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
54 EXT3_FEATURE_COMPAT_DIR_INDEX) &&
55 ((EXT3_I(inode)->i_flags & EXT3_INDEX_FL) ||
56 ((inode->i_size >> sb->s_blocksize_bits) == 1)))
57 return 1;
58
59 return 0;
60}
61
62int ext3_check_dir_entry (const char * function, struct inode * dir,
63 struct ext3_dir_entry_2 * de,
64 struct buffer_head * bh,
65 unsigned long offset)
66{
67 const char * error_msg = NULL;
68 const int rlen = ext3_rec_len_from_disk(de->rec_len);
69
70 if (unlikely(rlen < EXT3_DIR_REC_LEN(1)))
71 error_msg = "rec_len is smaller than minimal";
72 else if (unlikely(rlen % 4 != 0))
73 error_msg = "rec_len % 4 != 0";
74 else if (unlikely(rlen < EXT3_DIR_REC_LEN(de->name_len)))
75 error_msg = "rec_len is too small for name_len";
76 else if (unlikely((((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)))
77 error_msg = "directory entry across blocks";
78 else if (unlikely(le32_to_cpu(de->inode) >
79 le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count)))
80 error_msg = "inode out of bounds";
81
82 if (unlikely(error_msg != NULL))
83 ext3_error (dir->i_sb, function,
84 "bad entry in directory #%lu: %s - "
85 "offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
86 dir->i_ino, error_msg, offset,
87 (unsigned long) le32_to_cpu(de->inode),
88 rlen, de->name_len);
89
90 return error_msg == NULL ? 1 : 0;
91}
92
93static int ext3_readdir(struct file *file, struct dir_context *ctx)
94{
95 unsigned long offset;
96 int i;
97 struct ext3_dir_entry_2 *de;
98 int err;
99 struct inode *inode = file_inode(file);
100 struct super_block *sb = inode->i_sb;
101 int dir_has_error = 0;
102
103 if (is_dx_dir(inode)) {
104 err = ext3_dx_readdir(file, ctx);
105 if (err != ERR_BAD_DX_DIR)
106 return err;
107 /*
108 * We don't set the inode dirty flag since it's not
109 * critical that it get flushed back to the disk.
110 */
111 EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL;
112 }
113 offset = ctx->pos & (sb->s_blocksize - 1);
114
115 while (ctx->pos < inode->i_size) {
116 unsigned long blk = ctx->pos >> EXT3_BLOCK_SIZE_BITS(sb);
117 struct buffer_head map_bh;
118 struct buffer_head *bh = NULL;
119
120 map_bh.b_state = 0;
121 err = ext3_get_blocks_handle(NULL, inode, blk, 1, &map_bh, 0);
122 if (err > 0) {
123 pgoff_t index = map_bh.b_blocknr >>
124 (PAGE_CACHE_SHIFT - inode->i_blkbits);
125 if (!ra_has_index(&file->f_ra, index))
126 page_cache_sync_readahead(
127 sb->s_bdev->bd_inode->i_mapping,
128 &file->f_ra, file,
129 index, 1);
130 file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
131 bh = ext3_bread(NULL, inode, blk, 0, &err);
132 }
133
134 /*
135 * We ignore I/O errors on directories so users have a chance
136 * of recovering data when there's a bad sector
137 */
138 if (!bh) {
139 if (!dir_has_error) {
140 ext3_error(sb, __func__, "directory #%lu "
141 "contains a hole at offset %lld",
142 inode->i_ino, ctx->pos);
143 dir_has_error = 1;
144 }
145 /* corrupt size? Maybe no more blocks to read */
146 if (ctx->pos > inode->i_blocks << 9)
147 break;
148 ctx->pos += sb->s_blocksize - offset;
149 continue;
150 }
151
152 /* If the dir block has changed since the last call to
153 * readdir(2), then we might be pointing to an invalid
154 * dirent right now. Scan from the start of the block
155 * to make sure. */
156 if (offset && file->f_version != inode->i_version) {
157 for (i = 0; i < sb->s_blocksize && i < offset; ) {
158 de = (struct ext3_dir_entry_2 *)
159 (bh->b_data + i);
160 /* It's too expensive to do a full
161 * dirent test each time round this
162 * loop, but we do have to test at
163 * least that it is non-zero. A
164 * failure will be detected in the
165 * dirent test below. */
166 if (ext3_rec_len_from_disk(de->rec_len) <
167 EXT3_DIR_REC_LEN(1))
168 break;
169 i += ext3_rec_len_from_disk(de->rec_len);
170 }
171 offset = i;
172 ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1))
173 | offset;
174 file->f_version = inode->i_version;
175 }
176
177 while (ctx->pos < inode->i_size
178 && offset < sb->s_blocksize) {
179 de = (struct ext3_dir_entry_2 *) (bh->b_data + offset);
180 if (!ext3_check_dir_entry ("ext3_readdir", inode, de,
181 bh, offset)) {
182				/* On error, skip to the
183				   next block. */
184 ctx->pos = (ctx->pos |
185 (sb->s_blocksize - 1)) + 1;
186 break;
187 }
188 offset += ext3_rec_len_from_disk(de->rec_len);
189 if (le32_to_cpu(de->inode)) {
190 if (!dir_emit(ctx, de->name, de->name_len,
191 le32_to_cpu(de->inode),
192 get_dtype(sb, de->file_type))) {
193 brelse(bh);
194 return 0;
195 }
196 }
197 ctx->pos += ext3_rec_len_from_disk(de->rec_len);
198 }
199 offset = 0;
200 brelse (bh);
201 if (ctx->pos < inode->i_size)
202 if (!dir_relax(inode))
203 return 0;
204 }
205 return 0;
206}
207
208static inline int is_32bit_api(void)
209{
210#ifdef CONFIG_COMPAT
211 return is_compat_task();
212#else
213 return (BITS_PER_LONG == 32);
214#endif
215}
216
217/*
218 * These functions convert from the major/minor hash to an f_pos
219 * value for dx directories
220 *
221 * Upper layer (for example NFS) should specify FMODE_32BITHASH or
222 * FMODE_64BITHASH explicitly. On the other hand, we allow ext3 to be mounted
223 * directly on both 32-bit and 64-bit nodes, in which case neither
224 * FMODE_32BITHASH nor FMODE_64BITHASH is specified.
225 */
226static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor)
227{
228 if ((filp->f_mode & FMODE_32BITHASH) ||
229 (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
230 return major >> 1;
231 else
232 return ((__u64)(major >> 1) << 32) | (__u64)minor;
233}
234
235static inline __u32 pos2maj_hash(struct file *filp, loff_t pos)
236{
237 if ((filp->f_mode & FMODE_32BITHASH) ||
238 (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
239 return (pos << 1) & 0xffffffff;
240 else
241 return ((pos >> 32) << 1) & 0xffffffff;
242}
243
244static inline __u32 pos2min_hash(struct file *filp, loff_t pos)
245{
246 if ((filp->f_mode & FMODE_32BITHASH) ||
247 (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
248 return 0;
249 else
250 return pos & 0xffffffff;
251}
252
253/*
254 * Return 32- or 64-bit end-of-file for dx directories
255 */
256static inline loff_t ext3_get_htree_eof(struct file *filp)
257{
258 if ((filp->f_mode & FMODE_32BITHASH) ||
259 (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
260 return EXT3_HTREE_EOF_32BIT;
261 else
262 return EXT3_HTREE_EOF_64BIT;
263}
264
265
266/*
267 * ext3_dir_llseek() calls generic_file_llseek[_size]() to handle both
268 * non-htree and htree directories, where the "offset" is in terms
269 * of the filename hash value instead of the byte offset.
270 *
271 * Because we may return a 64-bit hash that is well beyond s_maxbytes,
272 * we need to pass the max hash as the maximum allowable offset in
273 * the htree directory case.
274 *
275 * NOTE: offsets obtained *before* ext3_set_inode_flag(dir, EXT3_INODE_INDEX)
276 * will be invalid once the directory is converted into a dx directory
277 */
278static loff_t ext3_dir_llseek(struct file *file, loff_t offset, int whence)
279{
280 struct inode *inode = file->f_mapping->host;
281 int dx_dir = is_dx_dir(inode);
282 loff_t htree_max = ext3_get_htree_eof(file);
283
284 if (likely(dx_dir))
285 return generic_file_llseek_size(file, offset, whence,
286 htree_max, htree_max);
287 else
288 return generic_file_llseek(file, offset, whence);
289}
290
291/*
292 * This structure holds the nodes of the red-black tree used to store
293 * the directory entry in hash order.
294 */
295struct fname {
296 __u32 hash;
297 __u32 minor_hash;
298 struct rb_node rb_hash;
299 struct fname *next;
300 __u32 inode;
301 __u8 name_len;
302 __u8 file_type;
303 char name[0];
304};
305
306/*
307 * This function implements a non-recursive way of freeing all of the
308 * nodes in the red-black tree.
309 */
310static void free_rb_tree_fname(struct rb_root *root)
311{
312 struct fname *fname, *next;
313
314 rbtree_postorder_for_each_entry_safe(fname, next, root, rb_hash)
315 do {
316 struct fname *old = fname;
317 fname = fname->next;
318 kfree(old);
319 } while (fname);
320
321 *root = RB_ROOT;
322}
323
324static struct dir_private_info *ext3_htree_create_dir_info(struct file *filp,
325 loff_t pos)
326{
327 struct dir_private_info *p;
328
329 p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
330 if (!p)
331 return NULL;
332 p->curr_hash = pos2maj_hash(filp, pos);
333 p->curr_minor_hash = pos2min_hash(filp, pos);
334 return p;
335}
336
337void ext3_htree_free_dir_info(struct dir_private_info *p)
338{
339 free_rb_tree_fname(&p->root);
340 kfree(p);
341}
342
343/*
344 * Given a directory entry, enter it into the fname rb tree.
345 */
346int ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
347 __u32 minor_hash,
348 struct ext3_dir_entry_2 *dirent)
349{
350 struct rb_node **p, *parent = NULL;
351 struct fname * fname, *new_fn;
352 struct dir_private_info *info;
353 int len;
354
355 info = (struct dir_private_info *) dir_file->private_data;
356 p = &info->root.rb_node;
357
358 /* Create and allocate the fname structure */
359 len = sizeof(struct fname) + dirent->name_len + 1;
360 new_fn = kzalloc(len, GFP_KERNEL);
361 if (!new_fn)
362 return -ENOMEM;
363 new_fn->hash = hash;
364 new_fn->minor_hash = minor_hash;
365 new_fn->inode = le32_to_cpu(dirent->inode);
366 new_fn->name_len = dirent->name_len;
367 new_fn->file_type = dirent->file_type;
368 memcpy(new_fn->name, dirent->name, dirent->name_len);
369 new_fn->name[dirent->name_len] = 0;
370
371 while (*p) {
372 parent = *p;
373 fname = rb_entry(parent, struct fname, rb_hash);
374
375 /*
376 * If the hash and minor hash match up, then we put
377 * them on a linked list. This rarely happens...
378 */
379 if ((new_fn->hash == fname->hash) &&
380 (new_fn->minor_hash == fname->minor_hash)) {
381 new_fn->next = fname->next;
382 fname->next = new_fn;
383 return 0;
384 }
385
386 if (new_fn->hash < fname->hash)
387 p = &(*p)->rb_left;
388 else if (new_fn->hash > fname->hash)
389 p = &(*p)->rb_right;
390 else if (new_fn->minor_hash < fname->minor_hash)
391 p = &(*p)->rb_left;
392 else /* if (new_fn->minor_hash > fname->minor_hash) */
393 p = &(*p)->rb_right;
394 }
395
396 rb_link_node(&new_fn->rb_hash, parent, p);
397 rb_insert_color(&new_fn->rb_hash, &info->root);
398 return 0;
399}
400
401
402
403/*
404 * This is a helper function for ext3_dx_readdir. It calls filldir
405 * for all entries on the fname linked list. (Normally there is only
406 * one entry on the linked list, unless there are 62 bit hash collisions.)
407 */
408static bool call_filldir(struct file *file, struct dir_context *ctx,
409 struct fname *fname)
410{
411 struct dir_private_info *info = file->private_data;
412 struct inode *inode = file_inode(file);
413 struct super_block *sb = inode->i_sb;
414
415 if (!fname) {
416 printk("call_filldir: called with null fname?!?\n");
417 return true;
418 }
419 ctx->pos = hash2pos(file, fname->hash, fname->minor_hash);
420 while (fname) {
421 if (!dir_emit(ctx, fname->name, fname->name_len,
422 fname->inode,
423 get_dtype(sb, fname->file_type))) {
424 info->extra_fname = fname;
425 return false;
426 }
427 fname = fname->next;
428 }
429 return true;
430}
431
432static int ext3_dx_readdir(struct file *file, struct dir_context *ctx)
433{
434 struct dir_private_info *info = file->private_data;
435 struct inode *inode = file_inode(file);
436 struct fname *fname;
437 int ret;
438
439 if (!info) {
440 info = ext3_htree_create_dir_info(file, ctx->pos);
441 if (!info)
442 return -ENOMEM;
443 file->private_data = info;
444 }
445
446 if (ctx->pos == ext3_get_htree_eof(file))
447 return 0; /* EOF */
448
449	/* Someone has messed with f_pos; reset the world */
450 if (info->last_pos != ctx->pos) {
451 free_rb_tree_fname(&info->root);
452 info->curr_node = NULL;
453 info->extra_fname = NULL;
454 info->curr_hash = pos2maj_hash(file, ctx->pos);
455 info->curr_minor_hash = pos2min_hash(file, ctx->pos);
456 }
457
458 /*
459 * If there are any leftover names on the hash collision
460 * chain, return them first.
461 */
462 if (info->extra_fname) {
463 if (!call_filldir(file, ctx, info->extra_fname))
464 goto finished;
465 info->extra_fname = NULL;
466 goto next_node;
467 } else if (!info->curr_node)
468 info->curr_node = rb_first(&info->root);
469
470 while (1) {
471 /*
472 * Fill the rbtree if we have no more entries,
473 * or the inode has changed since we last read in the
474 * cached entries.
475 */
476 if ((!info->curr_node) ||
477 (file->f_version != inode->i_version)) {
478 info->curr_node = NULL;
479 free_rb_tree_fname(&info->root);
480 file->f_version = inode->i_version;
481 ret = ext3_htree_fill_tree(file, info->curr_hash,
482 info->curr_minor_hash,
483 &info->next_hash);
484 if (ret < 0)
485 return ret;
486 if (ret == 0) {
487 ctx->pos = ext3_get_htree_eof(file);
488 break;
489 }
490 info->curr_node = rb_first(&info->root);
491 }
492
493 fname = rb_entry(info->curr_node, struct fname, rb_hash);
494 info->curr_hash = fname->hash;
495 info->curr_minor_hash = fname->minor_hash;
496 if (!call_filldir(file, ctx, fname))
497 break;
498 next_node:
499 info->curr_node = rb_next(info->curr_node);
500 if (info->curr_node) {
501 fname = rb_entry(info->curr_node, struct fname,
502 rb_hash);
503 info->curr_hash = fname->hash;
504 info->curr_minor_hash = fname->minor_hash;
505 } else {
506 if (info->next_hash == ~0) {
507 ctx->pos = ext3_get_htree_eof(file);
508 break;
509 }
510 info->curr_hash = info->next_hash;
511 info->curr_minor_hash = 0;
512 }
513 }
514finished:
515 info->last_pos = ctx->pos;
516 return 0;
517}
518
519static int ext3_release_dir (struct inode * inode, struct file * filp)
520{
521 if (filp->private_data)
522 ext3_htree_free_dir_info(filp->private_data);
523
524 return 0;
525}
526
527const struct file_operations ext3_dir_operations = {
528 .llseek = ext3_dir_llseek,
529 .read = generic_read_dir,
530 .iterate = ext3_readdir,
531 .unlocked_ioctl = ext3_ioctl,
532#ifdef CONFIG_COMPAT
533 .compat_ioctl = ext3_compat_ioctl,
534#endif
535 .fsync = ext3_sync_file,
536 .release = ext3_release_dir,
537};
diff --git a/fs/ext3/ext3.h b/fs/ext3/ext3.h
deleted file mode 100644
index f483a80b3fe7..000000000000
--- a/fs/ext3/ext3.h
+++ /dev/null
@@ -1,1332 +0,0 @@
1/*
2 * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
3 *
4 * Copyright 1998--1999 Red Hat corp --- All Rights Reserved
5 *
6 * This file is part of the Linux kernel and is made available under
7 * the terms of the GNU General Public License, version 2, or at your
8 * option, any later version, incorporated herein by reference.
9 *
10 * Copyright (C) 1992, 1993, 1994, 1995
11 * Remy Card (card@masi.ibp.fr)
12 * Laboratoire MASI - Institut Blaise Pascal
13 * Universite Pierre et Marie Curie (Paris VI)
14 *
15 * from
16 *
17 * linux/include/linux/minix_fs.h
18 *
19 * Copyright (C) 1991, 1992 Linus Torvalds
20 */
21
22#include <linux/fs.h>
23#include <linux/jbd.h>
24#include <linux/magic.h>
25#include <linux/bug.h>
26#include <linux/blockgroup_lock.h>
27
28/*
29 * The second extended filesystem constants/structures
30 */
31
32/*
33 * Define EXT3FS_DEBUG to produce debug messages
34 */
35#undef EXT3FS_DEBUG
36
37/*
38 * Define EXT3_RESERVATION to reserve data blocks for expanding files
39 */
40#define EXT3_DEFAULT_RESERVE_BLOCKS 8
41/*max window size: 1024(direct blocks) + 3([t,d]indirect blocks) */
42#define EXT3_MAX_RESERVE_BLOCKS 1027
43#define EXT3_RESERVE_WINDOW_NOT_ALLOCATED 0
44
45/*
46 * Debug code
47 */
48#ifdef EXT3FS_DEBUG
49#define ext3_debug(f, a...) \
50 do { \
51 printk (KERN_DEBUG "EXT3-fs DEBUG (%s, %d): %s:", \
52 __FILE__, __LINE__, __func__); \
53 printk (KERN_DEBUG f, ## a); \
54 } while (0)
55#else
56#define ext3_debug(f, a...) do {} while (0)
57#endif
58
59/*
60 * Special inodes numbers
61 */
62#define EXT3_BAD_INO 1 /* Bad blocks inode */
63#define EXT3_ROOT_INO 2 /* Root inode */
64#define EXT3_BOOT_LOADER_INO 5 /* Boot loader inode */
65#define EXT3_UNDEL_DIR_INO 6 /* Undelete directory inode */
66#define EXT3_RESIZE_INO 7 /* Reserved group descriptors inode */
67#define EXT3_JOURNAL_INO 8 /* Journal inode */
68
69/* First non-reserved inode for old ext3 filesystems */
70#define EXT3_GOOD_OLD_FIRST_INO 11
71
72/*
73 * Maximal count of links to a file
74 */
75#define EXT3_LINK_MAX 32000
76
77/*
78 * Macro-instructions used to manage several block sizes
79 */
80#define EXT3_MIN_BLOCK_SIZE 1024
81#define EXT3_MAX_BLOCK_SIZE 65536
82#define EXT3_MIN_BLOCK_LOG_SIZE 10
83#define EXT3_BLOCK_SIZE(s) ((s)->s_blocksize)
84#define EXT3_ADDR_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / sizeof (__u32))
85#define EXT3_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits)
86#define EXT3_ADDR_PER_BLOCK_BITS(s) (EXT3_SB(s)->s_addr_per_block_bits)
87#define EXT3_INODE_SIZE(s) (EXT3_SB(s)->s_inode_size)
88#define EXT3_FIRST_INO(s) (EXT3_SB(s)->s_first_ino)
89
90/*
91 * Macro-instructions used to manage fragments
92 */
93#define EXT3_MIN_FRAG_SIZE 1024
94#define EXT3_MAX_FRAG_SIZE 4096
95#define EXT3_MIN_FRAG_LOG_SIZE 10
96#define EXT3_FRAG_SIZE(s) (EXT3_SB(s)->s_frag_size)
97#define EXT3_FRAGS_PER_BLOCK(s) (EXT3_SB(s)->s_frags_per_block)
98
99/*
100 * Structure of a blocks group descriptor
101 */
102struct ext3_group_desc
103{
104 __le32 bg_block_bitmap; /* Blocks bitmap block */
105 __le32 bg_inode_bitmap; /* Inodes bitmap block */
106 __le32 bg_inode_table; /* Inodes table block */
107 __le16 bg_free_blocks_count; /* Free blocks count */
108 __le16 bg_free_inodes_count; /* Free inodes count */
109 __le16 bg_used_dirs_count; /* Directories count */
110 __u16 bg_pad;
111 __le32 bg_reserved[3];
112};
113
114/*
115 * Macro-instructions used to manage group descriptors
116 */
117#define EXT3_BLOCKS_PER_GROUP(s) (EXT3_SB(s)->s_blocks_per_group)
118#define EXT3_DESC_PER_BLOCK(s) (EXT3_SB(s)->s_desc_per_block)
119#define EXT3_INODES_PER_GROUP(s) (EXT3_SB(s)->s_inodes_per_group)
120#define EXT3_DESC_PER_BLOCK_BITS(s) (EXT3_SB(s)->s_desc_per_block_bits)
121
122/*
123 * Constants relative to the data blocks
124 */
125#define EXT3_NDIR_BLOCKS 12
126#define EXT3_IND_BLOCK EXT3_NDIR_BLOCKS
127#define EXT3_DIND_BLOCK (EXT3_IND_BLOCK + 1)
128#define EXT3_TIND_BLOCK (EXT3_DIND_BLOCK + 1)
129#define EXT3_N_BLOCKS (EXT3_TIND_BLOCK + 1)
130
131/*
132 * Inode flags
133 */
134#define EXT3_SECRM_FL 0x00000001 /* Secure deletion */
135#define EXT3_UNRM_FL 0x00000002 /* Undelete */
136#define EXT3_COMPR_FL 0x00000004 /* Compress file */
137#define EXT3_SYNC_FL 0x00000008 /* Synchronous updates */
138#define EXT3_IMMUTABLE_FL 0x00000010 /* Immutable file */
139#define EXT3_APPEND_FL 0x00000020 /* writes to file may only append */
140#define EXT3_NODUMP_FL 0x00000040 /* do not dump file */
141#define EXT3_NOATIME_FL 0x00000080 /* do not update atime */
142/* Reserved for compression usage... */
143#define EXT3_DIRTY_FL 0x00000100
144#define EXT3_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */
145#define EXT3_NOCOMPR_FL 0x00000400 /* Don't compress */
146#define EXT3_ECOMPR_FL 0x00000800 /* Compression error */
147/* End compression flags --- maybe not all used */
148#define EXT3_INDEX_FL 0x00001000 /* hash-indexed directory */
149#define EXT3_IMAGIC_FL 0x00002000 /* AFS directory */
150#define EXT3_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */
151#define EXT3_NOTAIL_FL 0x00008000 /* file tail should not be merged */
152#define EXT3_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */
153#define EXT3_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/
154#define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */
155
156#define EXT3_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */
157#define EXT3_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */
158
159/* Flags that should be inherited by new inodes from their parent. */
160#define EXT3_FL_INHERITED (EXT3_SECRM_FL | EXT3_UNRM_FL | EXT3_COMPR_FL |\
161 EXT3_SYNC_FL | EXT3_NODUMP_FL |\
162 EXT3_NOATIME_FL | EXT3_COMPRBLK_FL |\
163 EXT3_NOCOMPR_FL | EXT3_JOURNAL_DATA_FL |\
164 EXT3_NOTAIL_FL | EXT3_DIRSYNC_FL)
165
166/* Flags that are appropriate for regular files (all but dir-specific ones). */
167#define EXT3_REG_FLMASK (~(EXT3_DIRSYNC_FL | EXT3_TOPDIR_FL))
168
169/* Flags that are appropriate for non-directories/regular files. */
170#define EXT3_OTHER_FLMASK (EXT3_NODUMP_FL | EXT3_NOATIME_FL)
171
172/* Mask out flags that are inappropriate for the given type of inode. */
173static inline __u32 ext3_mask_flags(umode_t mode, __u32 flags)
174{
175 if (S_ISDIR(mode))
176 return flags;
177 else if (S_ISREG(mode))
178 return flags & EXT3_REG_FLMASK;
179 else
180 return flags & EXT3_OTHER_FLMASK;
181}
182
183/* Used to pass group descriptor data when online resize is done */
184struct ext3_new_group_input {
185 __u32 group; /* Group number for this data */
186 __u32 block_bitmap; /* Absolute block number of block bitmap */
187 __u32 inode_bitmap; /* Absolute block number of inode bitmap */
188 __u32 inode_table; /* Absolute block number of inode table start */
189 __u32 blocks_count; /* Total number of blocks in this group */
190 __u16 reserved_blocks; /* Number of reserved blocks in this group */
191 __u16 unused;
192};
193
194/* The struct ext3_new_group_input in kernel space, with free_blocks_count */
195struct ext3_new_group_data {
196 __u32 group;
197 __u32 block_bitmap;
198 __u32 inode_bitmap;
199 __u32 inode_table;
200 __u32 blocks_count;
201 __u16 reserved_blocks;
202 __u16 unused;
203 __u32 free_blocks_count;
204};
205
206
207/*
208 * ioctl commands
209 */
210#define EXT3_IOC_GETFLAGS FS_IOC_GETFLAGS
211#define EXT3_IOC_SETFLAGS FS_IOC_SETFLAGS
212#define EXT3_IOC_GETVERSION _IOR('f', 3, long)
213#define EXT3_IOC_SETVERSION _IOW('f', 4, long)
214#define EXT3_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long)
215#define EXT3_IOC_GROUP_ADD _IOW('f', 8,struct ext3_new_group_input)
216#define EXT3_IOC_GETVERSION_OLD FS_IOC_GETVERSION
217#define EXT3_IOC_SETVERSION_OLD FS_IOC_SETVERSION
218#ifdef CONFIG_JBD_DEBUG
219#define EXT3_IOC_WAIT_FOR_READONLY _IOR('f', 99, long)
220#endif
221#define EXT3_IOC_GETRSVSZ _IOR('f', 5, long)
222#define EXT3_IOC_SETRSVSZ _IOW('f', 6, long)
223
224/*
225 * ioctl commands in 32 bit emulation
226 */
227#define EXT3_IOC32_GETFLAGS FS_IOC32_GETFLAGS
228#define EXT3_IOC32_SETFLAGS FS_IOC32_SETFLAGS
229#define EXT3_IOC32_GETVERSION _IOR('f', 3, int)
230#define EXT3_IOC32_SETVERSION _IOW('f', 4, int)
231#define EXT3_IOC32_GETRSVSZ _IOR('f', 5, int)
232#define EXT3_IOC32_SETRSVSZ _IOW('f', 6, int)
233#define EXT3_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int)
234#ifdef CONFIG_JBD_DEBUG
235#define EXT3_IOC32_WAIT_FOR_READONLY _IOR('f', 99, int)
236#endif
237#define EXT3_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION
238#define EXT3_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION
239
240/* Number of supported quota types */
241#define EXT3_MAXQUOTAS 2
242
243/*
244 * Mount options
245 */
246struct ext3_mount_options {
247 unsigned long s_mount_opt;
248 kuid_t s_resuid;
249 kgid_t s_resgid;
250 unsigned long s_commit_interval;
251#ifdef CONFIG_QUOTA
252 int s_jquota_fmt;
253 char *s_qf_names[EXT3_MAXQUOTAS];
254#endif
255};
256
257/*
258 * Structure of an inode on the disk
259 */
260struct ext3_inode {
261 __le16 i_mode; /* File mode */
262 __le16 i_uid; /* Low 16 bits of Owner Uid */
263 __le32 i_size; /* Size in bytes */
264 __le32 i_atime; /* Access time */
265 __le32 i_ctime; /* Creation time */
266 __le32 i_mtime; /* Modification time */
267 __le32 i_dtime; /* Deletion Time */
268 __le16 i_gid; /* Low 16 bits of Group Id */
269 __le16 i_links_count; /* Links count */
270 __le32 i_blocks; /* Blocks count */
271 __le32 i_flags; /* File flags */
272 union {
273 struct {
274 __u32 l_i_reserved1;
275 } linux1;
276 struct {
277 __u32 h_i_translator;
278 } hurd1;
279 struct {
280 __u32 m_i_reserved1;
281 } masix1;
282 } osd1; /* OS dependent 1 */
283 __le32 i_block[EXT3_N_BLOCKS];/* Pointers to blocks */
284 __le32 i_generation; /* File version (for NFS) */
285 __le32 i_file_acl; /* File ACL */
286 __le32 i_dir_acl; /* Directory ACL */
287 __le32 i_faddr; /* Fragment address */
288 union {
289 struct {
290 __u8 l_i_frag; /* Fragment number */
291 __u8 l_i_fsize; /* Fragment size */
292 __u16 i_pad1;
293 __le16 l_i_uid_high; /* these 2 fields */
294 __le16 l_i_gid_high; /* were reserved2[0] */
295 __u32 l_i_reserved2;
296 } linux2;
297 struct {
298 __u8 h_i_frag; /* Fragment number */
299 __u8 h_i_fsize; /* Fragment size */
300 __u16 h_i_mode_high;
301 __u16 h_i_uid_high;
302 __u16 h_i_gid_high;
303 __u32 h_i_author;
304 } hurd2;
305 struct {
306 __u8 m_i_frag; /* Fragment number */
307 __u8 m_i_fsize; /* Fragment size */
308 __u16 m_pad1;
309 __u32 m_i_reserved2[2];
310 } masix2;
311 } osd2; /* OS dependent 2 */
312 __le16 i_extra_isize;
313 __le16 i_pad1;
314};
315
316#define i_size_high i_dir_acl
317
318#define i_reserved1 osd1.linux1.l_i_reserved1
319#define i_frag osd2.linux2.l_i_frag
320#define i_fsize osd2.linux2.l_i_fsize
321#define i_uid_low i_uid
322#define i_gid_low i_gid
323#define i_uid_high osd2.linux2.l_i_uid_high
324#define i_gid_high osd2.linux2.l_i_gid_high
325#define i_reserved2 osd2.linux2.l_i_reserved2
326
327/*
328 * File system states
329 */
330#define EXT3_VALID_FS 0x0001 /* Unmounted cleanly */
331#define EXT3_ERROR_FS 0x0002 /* Errors detected */
332#define EXT3_ORPHAN_FS 0x0004 /* Orphans being recovered */
333
334/*
335 * Misc. filesystem flags
336 */
337#define EXT2_FLAGS_SIGNED_HASH 0x0001 /* Signed dirhash in use */
338#define EXT2_FLAGS_UNSIGNED_HASH 0x0002 /* Unsigned dirhash in use */
339#define EXT2_FLAGS_TEST_FILESYS 0x0004 /* to test development code */
340
341/*
342 * Mount flags
343 */
344#define EXT3_MOUNT_CHECK 0x00001 /* Do mount-time checks */
345/* EXT3_MOUNT_OLDALLOC was there */
346#define EXT3_MOUNT_GRPID 0x00004 /* Create files with directory's group */
347#define EXT3_MOUNT_DEBUG 0x00008 /* Some debugging messages */
348#define EXT3_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */
349#define EXT3_MOUNT_ERRORS_RO 0x00020 /* Remount fs ro on errors */
350#define EXT3_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */
351#define EXT3_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */
352#define EXT3_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/
353#define EXT3_MOUNT_ABORT 0x00200 /* Fatal error detected */
354#define EXT3_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */
355#define EXT3_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */
356#define EXT3_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */
357#define EXT3_MOUNT_WRITEBACK_DATA 0x00C00 /* No data ordering */
358#define EXT3_MOUNT_UPDATE_JOURNAL 0x01000 /* Update the journal format */
359#define EXT3_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */
360#define EXT3_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */
361#define EXT3_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */
362#define EXT3_MOUNT_RESERVATION 0x10000 /* Preallocation */
363#define EXT3_MOUNT_BARRIER 0x20000 /* Use block barriers */
364#define EXT3_MOUNT_QUOTA 0x80000 /* Some quota option set */
365#define EXT3_MOUNT_USRQUOTA 0x100000 /* "old" user quota */
366#define EXT3_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */
367#define EXT3_MOUNT_DATA_ERR_ABORT 0x400000 /* Abort on file data write
368 * error in ordered mode */
369
370/* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
371#ifndef _LINUX_EXT2_FS_H
372#define clear_opt(o, opt) o &= ~EXT3_MOUNT_##opt
373#define set_opt(o, opt) o |= EXT3_MOUNT_##opt
374#define test_opt(sb, opt) (EXT3_SB(sb)->s_mount_opt & \
375 EXT3_MOUNT_##opt)
376#else
377#define EXT2_MOUNT_NOLOAD EXT3_MOUNT_NOLOAD
378#define EXT2_MOUNT_ABORT EXT3_MOUNT_ABORT
379#define EXT2_MOUNT_DATA_FLAGS EXT3_MOUNT_DATA_FLAGS
380#endif
381
382#define ext3_set_bit __set_bit_le
383#define ext3_set_bit_atomic ext2_set_bit_atomic
384#define ext3_clear_bit __clear_bit_le
385#define ext3_clear_bit_atomic ext2_clear_bit_atomic
386#define ext3_test_bit test_bit_le
387#define ext3_find_next_zero_bit find_next_zero_bit_le
388
389/*
390 * Maximal mount counts between two filesystem checks
391 */
392#define EXT3_DFL_MAX_MNT_COUNT 20 /* Allow 20 mounts */
393#define EXT3_DFL_CHECKINTERVAL 0 /* Don't use interval check */
394
395/*
396 * Behaviour when detecting errors
397 */
398#define EXT3_ERRORS_CONTINUE 1 /* Continue execution */
399#define EXT3_ERRORS_RO 2 /* Remount fs read-only */
400#define EXT3_ERRORS_PANIC 3 /* Panic */
401#define EXT3_ERRORS_DEFAULT EXT3_ERRORS_CONTINUE
402
403/*
404 * Structure of the super block
405 */
406struct ext3_super_block {
407/*00*/ __le32 s_inodes_count; /* Inodes count */
408 __le32 s_blocks_count; /* Blocks count */
409 __le32 s_r_blocks_count; /* Reserved blocks count */
410 __le32 s_free_blocks_count; /* Free blocks count */
411/*10*/ __le32 s_free_inodes_count; /* Free inodes count */
412 __le32 s_first_data_block; /* First Data Block */
413 __le32 s_log_block_size; /* Block size */
414 __le32 s_log_frag_size; /* Fragment size */
415/*20*/ __le32 s_blocks_per_group; /* # Blocks per group */
416 __le32 s_frags_per_group; /* # Fragments per group */
417 __le32 s_inodes_per_group; /* # Inodes per group */
418 __le32 s_mtime; /* Mount time */
419/*30*/ __le32 s_wtime; /* Write time */
420 __le16 s_mnt_count; /* Mount count */
421 __le16 s_max_mnt_count; /* Maximal mount count */
422 __le16 s_magic; /* Magic signature */
423 __le16 s_state; /* File system state */
424 __le16 s_errors; /* Behaviour when detecting errors */
425 __le16 s_minor_rev_level; /* minor revision level */
426/*40*/ __le32 s_lastcheck; /* time of last check */
427 __le32 s_checkinterval; /* max. time between checks */
428 __le32 s_creator_os; /* OS */
429 __le32 s_rev_level; /* Revision level */
430/*50*/ __le16 s_def_resuid; /* Default uid for reserved blocks */
431 __le16 s_def_resgid; /* Default gid for reserved blocks */
432 /*
433 * These fields are for EXT3_DYNAMIC_REV superblocks only.
434 *
435 * Note: the difference between the compatible feature set and
436 * the incompatible feature set is that if there is a bit set
437 * in the incompatible feature set that the kernel doesn't
438 * know about, it should refuse to mount the filesystem.
439 *
440 * e2fsck's requirements are more strict; if it doesn't know
441 * about a feature in either the compatible or incompatible
442 * feature set, it must abort and not try to meddle with
443 * things it doesn't understand...
444 */
445 __le32 s_first_ino; /* First non-reserved inode */
446 __le16 s_inode_size; /* size of inode structure */
447 __le16 s_block_group_nr; /* block group # of this superblock */
448 __le32 s_feature_compat; /* compatible feature set */
449/*60*/ __le32 s_feature_incompat; /* incompatible feature set */
450 __le32 s_feature_ro_compat; /* readonly-compatible feature set */
451/*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */
452/*78*/ char s_volume_name[16]; /* volume name */
453/*88*/ char s_last_mounted[64]; /* directory where last mounted */
454/*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */
455 /*
456 * Performance hints. Directory preallocation should only
457 * happen if the EXT3_FEATURE_COMPAT_DIR_PREALLOC flag is on.
458 */
459 __u8 s_prealloc_blocks; /* Nr of blocks to try to preallocate*/
460 __u8 s_prealloc_dir_blocks; /* Nr to preallocate for dirs */
461 __le16 s_reserved_gdt_blocks; /* Per group desc for online growth */
462 /*
463 * Journaling support valid if EXT3_FEATURE_COMPAT_HAS_JOURNAL set.
464 */
465/*D0*/ __u8 s_journal_uuid[16]; /* uuid of journal superblock */
466/*E0*/ __le32 s_journal_inum; /* inode number of journal file */
467 __le32 s_journal_dev; /* device number of journal file */
468 __le32 s_last_orphan; /* start of list of inodes to delete */
469 __le32 s_hash_seed[4]; /* HTREE hash seed */
470 __u8 s_def_hash_version; /* Default hash version to use */
471 __u8 s_reserved_char_pad;
472 __u16 s_reserved_word_pad;
473 __le32 s_default_mount_opts;
474 __le32 s_first_meta_bg; /* First metablock block group */
475 __le32 s_mkfs_time; /* When the filesystem was created */
476 __le32 s_jnl_blocks[17]; /* Backup of the journal inode */
477 /* 64bit support valid if EXT4_FEATURE_COMPAT_64BIT */
478/*150*/ __le32 s_blocks_count_hi; /* Blocks count */
479 __le32 s_r_blocks_count_hi; /* Reserved blocks count */
480 __le32 s_free_blocks_count_hi; /* Free blocks count */
481 __le16 s_min_extra_isize; /* All inodes have at least # bytes */
482 __le16 s_want_extra_isize; /* New inodes should reserve # bytes */
483 __le32 s_flags; /* Miscellaneous flags */
484 __le16 s_raid_stride; /* RAID stride */
485 __le16 s_mmp_interval; /* # seconds to wait in MMP checking */
486 __le64 s_mmp_block; /* Block for multi-mount protection */
487 __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/
488 __u8 s_log_groups_per_flex; /* FLEX_BG group size */
489 __u8 s_reserved_char_pad2;
490 __le16 s_reserved_pad;
491 __u32 s_reserved[162]; /* Padding to the end of the block */
492};
493
494/* data type for block offset of block group */
495typedef int ext3_grpblk_t;
496
497/* data type for filesystem-wide blocks number */
498typedef unsigned long ext3_fsblk_t;
499
500#define E3FSBLK "%lu"
501
502struct ext3_reserve_window {
503 ext3_fsblk_t _rsv_start; /* First byte reserved */
504 ext3_fsblk_t _rsv_end; /* Last byte reserved or 0 */
505};
506
507struct ext3_reserve_window_node {
508 struct rb_node rsv_node;
509 __u32 rsv_goal_size;
510 __u32 rsv_alloc_hit;
511 struct ext3_reserve_window rsv_window;
512};
513
514struct ext3_block_alloc_info {
515 /* information about reservation window */
516 struct ext3_reserve_window_node rsv_window_node;
517 /*
518 * was i_next_alloc_block in ext3_inode_info
519 * is the logical (file-relative) number of the
520 * most-recently-allocated block in this file.
521 * We use this for detecting linearly ascending allocation requests.
522 */
523 __u32 last_alloc_logical_block;
524 /*
525 * Was i_next_alloc_goal in ext3_inode_info
526 * is the *physical* companion to i_next_alloc_block.
527	 * it is the physical block number of the block which was most recently
528	 * allocated to this file. This gives us the goal (target) for the next
529 * allocation when we detect linearly ascending requests.
530 */
531 ext3_fsblk_t last_alloc_physical_block;
532};
533
534#define rsv_start rsv_window._rsv_start
535#define rsv_end rsv_window._rsv_end
536
537/*
538 * third extended file system inode data in memory
539 */
540struct ext3_inode_info {
541 __le32 i_data[15]; /* unconverted */
542 __u32 i_flags;
543#ifdef EXT3_FRAGMENTS
544 __u32 i_faddr;
545 __u8 i_frag_no;
546 __u8 i_frag_size;
547#endif
548 ext3_fsblk_t i_file_acl;
549 __u32 i_dir_acl;
550 __u32 i_dtime;
551
552 /*
553 * i_block_group is the number of the block group which contains
554 * this file's inode. Constant across the lifetime of the inode,
555	 * it is used for making block allocation decisions - we try to
556 * place a file's data blocks near its inode block, and new inodes
557 * near to their parent directory's inode.
558 */
559 __u32 i_block_group;
560 unsigned long i_state_flags; /* Dynamic state flags for ext3 */
561
562 /* block reservation info */
563 struct ext3_block_alloc_info *i_block_alloc_info;
564
565 __u32 i_dir_start_lookup;
566#ifdef CONFIG_EXT3_FS_XATTR
567 /*
568 * Extended attributes can be read independently of the main file
569 * data. Taking i_mutex even when reading would cause contention
570 * between readers of EAs and writers of regular file data, so
571 * instead we synchronize on xattr_sem when reading or changing
572 * EAs.
573 */
574 struct rw_semaphore xattr_sem;
575#endif
576
577 struct list_head i_orphan; /* unlinked but open inodes */
578
579 /*
580 * i_disksize keeps track of what the inode size is ON DISK, not
581 * in memory. During truncate, i_size is set to the new size by
582 * the VFS prior to calling ext3_truncate(), but the filesystem won't
583 * set i_disksize to 0 until the truncate is actually under way.
584 *
585 * The intent is that i_disksize always represents the blocks which
586 * are used by this file. This allows recovery to restart truncate
587 * on orphans if we crash during truncate. We actually write i_disksize
588 * into the on-disk inode when writing inodes out, instead of i_size.
589 *
590 * The only time when i_disksize and i_size may be different is when
591 * a truncate is in progress. The only things which change i_disksize
592 * are ext3_get_block (growth) and ext3_truncate (shrinkth).
593 */
594 loff_t i_disksize;
595
596 /* on-disk additional length */
597 __u16 i_extra_isize;
598
599 /*
600 * truncate_mutex is for serialising ext3_truncate() against
601 * ext3_getblock(). In the 2.4 ext2 design, great chunks of inode's
602 * data tree are chopped off during truncate. We can't do that in
603 * ext3 because whenever we perform intermediate commits during
604 * truncate, the inode and all the metadata blocks *must* be in a
605 * consistent state which allows truncation of the orphans to restart
606 * during recovery. Hence we must fix the get_block-vs-truncate race
607 * by other means, so we have truncate_mutex.
608 */
609 struct mutex truncate_mutex;
610
611 /*
612 * Transactions that contain inode's metadata needed to complete
613 * fsync and fdatasync, respectively.
614 */
615 atomic_t i_sync_tid;
616 atomic_t i_datasync_tid;
617
618#ifdef CONFIG_QUOTA
619 struct dquot *i_dquot[MAXQUOTAS];
620#endif
621
622 struct inode vfs_inode;
623};
624
625/*
626 * third extended-fs super-block data in memory
627 */
628struct ext3_sb_info {
629 unsigned long s_frag_size; /* Size of a fragment in bytes */
630 unsigned long s_frags_per_block;/* Number of fragments per block */
631 unsigned long s_inodes_per_block;/* Number of inodes per block */
632 unsigned long s_frags_per_group;/* Number of fragments in a group */
633 unsigned long s_blocks_per_group;/* Number of blocks in a group */
634 unsigned long s_inodes_per_group;/* Number of inodes in a group */
635 unsigned long s_itb_per_group; /* Number of inode table blocks per group */
636 unsigned long s_gdb_count; /* Number of group descriptor blocks */
637 unsigned long s_desc_per_block; /* Number of group descriptors per block */
638 unsigned long s_groups_count; /* Number of groups in the fs */
639 unsigned long s_overhead_last; /* Last calculated overhead */
640 unsigned long s_blocks_last; /* Last seen block count */
641 struct buffer_head * s_sbh; /* Buffer containing the super block */
642 struct ext3_super_block * s_es; /* Pointer to the super block in the buffer */
643 struct buffer_head ** s_group_desc;
644 unsigned long s_mount_opt;
645 ext3_fsblk_t s_sb_block;
646 kuid_t s_resuid;
647 kgid_t s_resgid;
648 unsigned short s_mount_state;
649 unsigned short s_pad;
650 int s_addr_per_block_bits;
651 int s_desc_per_block_bits;
652 int s_inode_size;
653 int s_first_ino;
654 spinlock_t s_next_gen_lock;
655 u32 s_next_generation;
656 u32 s_hash_seed[4];
657 int s_def_hash_version;
658 int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */
659 struct percpu_counter s_freeblocks_counter;
660 struct percpu_counter s_freeinodes_counter;
661 struct percpu_counter s_dirs_counter;
662 struct blockgroup_lock *s_blockgroup_lock;
663
664 /* root of the per fs reservation window tree */
665 spinlock_t s_rsv_window_lock;
666 struct rb_root s_rsv_window_root;
667 struct ext3_reserve_window_node s_rsv_window_head;
668
669 /* Journaling */
670 struct inode * s_journal_inode;
671 struct journal_s * s_journal;
672 struct list_head s_orphan;
673 struct mutex s_orphan_lock;
674 struct mutex s_resize_lock;
675 unsigned long s_commit_interval;
676 struct block_device *journal_bdev;
677#ifdef CONFIG_QUOTA
678 char *s_qf_names[EXT3_MAXQUOTAS]; /* Names of quota files with journalled quota */
679 int s_jquota_fmt; /* Format of quota to use */
680#endif
681};
682
683static inline spinlock_t *
684sb_bgl_lock(struct ext3_sb_info *sbi, unsigned int block_group)
685{
686 return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group);
687}
688
689static inline struct ext3_sb_info * EXT3_SB(struct super_block *sb)
690{
691 return sb->s_fs_info;
692}
693static inline struct ext3_inode_info *EXT3_I(struct inode *inode)
694{
695 return container_of(inode, struct ext3_inode_info, vfs_inode);
696}
697
698static inline int ext3_valid_inum(struct super_block *sb, unsigned long ino)
699{
700 return ino == EXT3_ROOT_INO ||
701 ino == EXT3_JOURNAL_INO ||
702 ino == EXT3_RESIZE_INO ||
703 (ino >= EXT3_FIRST_INO(sb) &&
704 ino <= le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count));
705}
706
707/*
708 * Inode dynamic state flags
709 */
710enum {
711 EXT3_STATE_JDATA, /* journaled data exists */
712 EXT3_STATE_NEW, /* inode is newly created */
713 EXT3_STATE_XATTR, /* has in-inode xattrs */
714 EXT3_STATE_FLUSH_ON_CLOSE, /* flush dirty pages on close */
715};
716
717static inline int ext3_test_inode_state(struct inode *inode, int bit)
718{
719 return test_bit(bit, &EXT3_I(inode)->i_state_flags);
720}
721
722static inline void ext3_set_inode_state(struct inode *inode, int bit)
723{
724 set_bit(bit, &EXT3_I(inode)->i_state_flags);
725}
726
727static inline void ext3_clear_inode_state(struct inode *inode, int bit)
728{
729 clear_bit(bit, &EXT3_I(inode)->i_state_flags);
730}
731
732#define NEXT_ORPHAN(inode) EXT3_I(inode)->i_dtime
733
734/*
735 * Codes for operating systems
736 */
737#define EXT3_OS_LINUX 0
738#define EXT3_OS_HURD 1
739#define EXT3_OS_MASIX 2
740#define EXT3_OS_FREEBSD 3
741#define EXT3_OS_LITES 4
742
743/*
744 * Revision levels
745 */
746#define EXT3_GOOD_OLD_REV 0 /* The good old (original) format */
747#define EXT3_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */
748
749#define EXT3_CURRENT_REV EXT3_GOOD_OLD_REV
750#define EXT3_MAX_SUPP_REV EXT3_DYNAMIC_REV
751
752#define EXT3_GOOD_OLD_INODE_SIZE 128
753
754/*
755 * Feature set definitions
756 */
757
758#define EXT3_HAS_COMPAT_FEATURE(sb,mask) \
759 ( EXT3_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask) )
760#define EXT3_HAS_RO_COMPAT_FEATURE(sb,mask) \
761 ( EXT3_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask) )
762#define EXT3_HAS_INCOMPAT_FEATURE(sb,mask) \
763 ( EXT3_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask) )
764#define EXT3_SET_COMPAT_FEATURE(sb,mask) \
765 EXT3_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask)
766#define EXT3_SET_RO_COMPAT_FEATURE(sb,mask) \
767 EXT3_SB(sb)->s_es->s_feature_ro_compat |= cpu_to_le32(mask)
768#define EXT3_SET_INCOMPAT_FEATURE(sb,mask) \
769 EXT3_SB(sb)->s_es->s_feature_incompat |= cpu_to_le32(mask)
770#define EXT3_CLEAR_COMPAT_FEATURE(sb,mask) \
771 EXT3_SB(sb)->s_es->s_feature_compat &= ~cpu_to_le32(mask)
772#define EXT3_CLEAR_RO_COMPAT_FEATURE(sb,mask) \
773 EXT3_SB(sb)->s_es->s_feature_ro_compat &= ~cpu_to_le32(mask)
774#define EXT3_CLEAR_INCOMPAT_FEATURE(sb,mask) \
775 EXT3_SB(sb)->s_es->s_feature_incompat &= ~cpu_to_le32(mask)
776
777#define EXT3_FEATURE_COMPAT_DIR_PREALLOC 0x0001
778#define EXT3_FEATURE_COMPAT_IMAGIC_INODES 0x0002
779#define EXT3_FEATURE_COMPAT_HAS_JOURNAL 0x0004
780#define EXT3_FEATURE_COMPAT_EXT_ATTR 0x0008
781#define EXT3_FEATURE_COMPAT_RESIZE_INODE 0x0010
782#define EXT3_FEATURE_COMPAT_DIR_INDEX 0x0020
783
784#define EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001
785#define EXT3_FEATURE_RO_COMPAT_LARGE_FILE 0x0002
786#define EXT3_FEATURE_RO_COMPAT_BTREE_DIR 0x0004
787
788#define EXT3_FEATURE_INCOMPAT_COMPRESSION 0x0001
789#define EXT3_FEATURE_INCOMPAT_FILETYPE 0x0002
790#define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */
791#define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */
792#define EXT3_FEATURE_INCOMPAT_META_BG 0x0010
793
794#define EXT3_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR
795#define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \
796 EXT3_FEATURE_INCOMPAT_RECOVER| \
797 EXT3_FEATURE_INCOMPAT_META_BG)
798#define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \
799 EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \
800 EXT3_FEATURE_RO_COMPAT_BTREE_DIR)
801
802/*
803 * Default values for user and/or group using reserved blocks
804 */
805#define EXT3_DEF_RESUID 0
806#define EXT3_DEF_RESGID 0
807
808/*
809 * Default mount options
810 */
811#define EXT3_DEFM_DEBUG 0x0001
812#define EXT3_DEFM_BSDGROUPS 0x0002
813#define EXT3_DEFM_XATTR_USER 0x0004
814#define EXT3_DEFM_ACL 0x0008
815#define EXT3_DEFM_UID16 0x0010
816#define EXT3_DEFM_JMODE 0x0060
817#define EXT3_DEFM_JMODE_DATA 0x0020
818#define EXT3_DEFM_JMODE_ORDERED 0x0040
819#define EXT3_DEFM_JMODE_WBACK 0x0060
820
821/*
822 * Structure of a directory entry
823 */
824#define EXT3_NAME_LEN 255
825
826struct ext3_dir_entry {
827 __le32 inode; /* Inode number */
828 __le16 rec_len; /* Directory entry length */
829 __le16 name_len; /* Name length */
830 char name[EXT3_NAME_LEN]; /* File name */
831};
832
833/*
834 * The new version of the directory entry. Since EXT3 structures are
835 * stored in intel byte order, and the name_len field could never be
836 * bigger than 255 chars, it's safe to reclaim the extra byte for the
837 * file_type field.
838 */
839struct ext3_dir_entry_2 {
840 __le32 inode; /* Inode number */
841 __le16 rec_len; /* Directory entry length */
842 __u8 name_len; /* Name length */
843 __u8 file_type;
844 char name[EXT3_NAME_LEN]; /* File name */
845};
846
847/*
848 * Ext3 directory file types. Only the low 3 bits are used. The
849 * other bits are reserved for now.
850 */
851#define EXT3_FT_UNKNOWN 0
852#define EXT3_FT_REG_FILE 1
853#define EXT3_FT_DIR 2
854#define EXT3_FT_CHRDEV 3
855#define EXT3_FT_BLKDEV 4
856#define EXT3_FT_FIFO 5
857#define EXT3_FT_SOCK 6
858#define EXT3_FT_SYMLINK 7
859
860#define EXT3_FT_MAX 8
861
862/*
863 * EXT3_DIR_PAD defines the directory entries boundaries
864 *
865 * NOTE: It must be a multiple of 4
866 */
867#define EXT3_DIR_PAD 4
868#define EXT3_DIR_ROUND (EXT3_DIR_PAD - 1)
869#define EXT3_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT3_DIR_ROUND) & \
870 ~EXT3_DIR_ROUND)
871#define EXT3_MAX_REC_LEN ((1<<16)-1)
872
873/*
874 * Tests against MAX_REC_LEN etc were put in place for 64k block
875 * sizes; if that is not possible on this arch, we can skip
876 * those tests and speed things up.
877 */
878static inline unsigned ext3_rec_len_from_disk(__le16 dlen)
879{
880 unsigned len = le16_to_cpu(dlen);
881
882#if (PAGE_CACHE_SIZE >= 65536)
883 if (len == EXT3_MAX_REC_LEN)
884 return 1 << 16;
885#endif
886 return len;
887}
888
889static inline __le16 ext3_rec_len_to_disk(unsigned len)
890{
891#if (PAGE_CACHE_SIZE >= 65536)
892 if (len == (1 << 16))
893 return cpu_to_le16(EXT3_MAX_REC_LEN);
894 else if (len > (1 << 16))
895 BUG();
896#endif
897 return cpu_to_le16(len);
898}
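For illustration, the rec_len arithmetic above can be checked with a small stand-alone program (not part of the removed source; the macro names are shortened for the sketch): each entry is the 8-byte header plus the name, rounded up to a 4-byte boundary, and a full 65536-byte record is encoded on disk as EXT3_MAX_REC_LEN.

/* Stand-alone sketch of the directory rec_len rounding above; illustration only. */
#include <stdio.h>

#define DIR_PAD   4
#define DIR_ROUND (DIR_PAD - 1)
#define DIR_REC_LEN(name_len) (((name_len) + 8 + DIR_ROUND) & ~DIR_ROUND)

int main(void)
{
	/* a 1-char name needs 1 + 8 = 9 bytes, rounded up to 12; a 255-char name needs 264 */
	printf("rec_len(1)   = %d\n", DIR_REC_LEN(1));
	printf("rec_len(255) = %d\n", DIR_REC_LEN(255));
	return 0;
}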
899
900/*
901 * Hash Tree Directory indexing
902 * (c) Daniel Phillips, 2001
903 */
904
905#define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \
906 EXT3_FEATURE_COMPAT_DIR_INDEX) && \
907 (EXT3_I(dir)->i_flags & EXT3_INDEX_FL))
908#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX)
909#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
910
911/* Legal values for the dx_root hash_version field: */
912
913#define DX_HASH_LEGACY 0
914#define DX_HASH_HALF_MD4 1
915#define DX_HASH_TEA 2
916#define DX_HASH_LEGACY_UNSIGNED 3
917#define DX_HASH_HALF_MD4_UNSIGNED 4
918#define DX_HASH_TEA_UNSIGNED 5
919
920/* hash info structure used by the directory hash */
921struct dx_hash_info
922{
923 u32 hash;
924 u32 minor_hash;
925 int hash_version;
926 u32 *seed;
927};
928
929
930/* 32 and 64 bit signed EOF for dx directories */
931#define EXT3_HTREE_EOF_32BIT ((1UL << (32 - 1)) - 1)
932#define EXT3_HTREE_EOF_64BIT ((1ULL << (64 - 1)) - 1)
933
934
935/*
936 * Control parameters used by ext3_htree_next_block
937 */
938#define HASH_NB_ALWAYS 1
939
940
941/*
942 * Describe an inode's exact location on disk and in memory
943 */
944struct ext3_iloc
945{
946 struct buffer_head *bh;
947 unsigned long offset;
948 unsigned long block_group;
949};
950
951static inline struct ext3_inode *ext3_raw_inode(struct ext3_iloc *iloc)
952{
953 return (struct ext3_inode *) (iloc->bh->b_data + iloc->offset);
954}
955
956/*
957 * This structure is stuffed into the struct file's private_data field
958 * for directories. It is where we put information so that we can do
959 * readdir operations in hash tree order.
960 */
961struct dir_private_info {
962 struct rb_root root;
963 struct rb_node *curr_node;
964 struct fname *extra_fname;
965 loff_t last_pos;
966 __u32 curr_hash;
967 __u32 curr_minor_hash;
968 __u32 next_hash;
969};
970
971/* calculate the first block number of the group */
972static inline ext3_fsblk_t
973ext3_group_first_block_no(struct super_block *sb, unsigned long group_no)
974{
975 return group_no * (ext3_fsblk_t)EXT3_BLOCKS_PER_GROUP(sb) +
976 le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block);
977}
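For a concrete feel of this mapping, here is a tiny stand-alone sketch; the block counts are assumed values for illustration, not read from a real superblock:

/* Worked example of ext3_group_first_block_no(): group N starts at
 * N * blocks_per_group + first_data_block (1 for 1 KiB blocks). */
#include <stdio.h>

int main(void)
{
	unsigned long blocks_per_group = 8192, first_data_block = 1;
	unsigned long group = 3;

	printf("group %lu starts at block %lu\n", group,
	       group * blocks_per_group + first_data_block); /* 24577 */
	return 0;
}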
978
979/*
980 * Special error return code only used by dx_probe() and its callers.
981 */
982#define ERR_BAD_DX_DIR -75000
983
984/*
985 * Function prototypes
986 */
987
988/*
989 * Ok, these declarations are also in <linux/kernel.h> but none of the
990 * ext3 source programs needs to include it so they are duplicated here.
991 */
992# define NORET_TYPE /**/
993# define ATTRIB_NORET __attribute__((noreturn))
994# define NORET_AND noreturn,
995
996/* balloc.c */
997extern int ext3_bg_has_super(struct super_block *sb, int group);
998extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group);
999extern ext3_fsblk_t ext3_new_block (handle_t *handle, struct inode *inode,
1000 ext3_fsblk_t goal, int *errp);
1001extern ext3_fsblk_t ext3_new_blocks (handle_t *handle, struct inode *inode,
1002 ext3_fsblk_t goal, unsigned long *count, int *errp);
1003extern void ext3_free_blocks (handle_t *handle, struct inode *inode,
1004 ext3_fsblk_t block, unsigned long count);
1005extern void ext3_free_blocks_sb (handle_t *handle, struct super_block *sb,
1006 ext3_fsblk_t block, unsigned long count,
1007 unsigned long *pdquot_freed_blocks);
1008extern ext3_fsblk_t ext3_count_free_blocks (struct super_block *);
1009extern void ext3_check_blocks_bitmap (struct super_block *);
1010extern struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
1011 unsigned int block_group,
1012 struct buffer_head ** bh);
1013extern int ext3_should_retry_alloc(struct super_block *sb, int *retries);
1014extern void ext3_init_block_alloc_info(struct inode *);
1015extern void ext3_rsv_window_add(struct super_block *sb, struct ext3_reserve_window_node *rsv);
1016extern int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range);
1017
1018/* dir.c */
1019extern int ext3_check_dir_entry(const char *, struct inode *,
1020 struct ext3_dir_entry_2 *,
1021 struct buffer_head *, unsigned long);
1022extern int ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
1023 __u32 minor_hash,
1024 struct ext3_dir_entry_2 *dirent);
1025extern void ext3_htree_free_dir_info(struct dir_private_info *p);
1026
1027/* fsync.c */
1028extern int ext3_sync_file(struct file *, loff_t, loff_t, int);
1029
1030/* hash.c */
1031extern int ext3fs_dirhash(const char *name, int len, struct
1032 dx_hash_info *hinfo);
1033
1034/* ialloc.c */
1035extern struct inode * ext3_new_inode (handle_t *, struct inode *,
1036 const struct qstr *, umode_t);
1037extern void ext3_free_inode (handle_t *, struct inode *);
1038extern struct inode * ext3_orphan_get (struct super_block *, unsigned long);
1039extern unsigned long ext3_count_free_inodes (struct super_block *);
1040extern unsigned long ext3_count_dirs (struct super_block *);
1041extern void ext3_check_inodes_bitmap (struct super_block *);
1042extern unsigned long ext3_count_free (struct buffer_head *, unsigned);
1043
1044
1045/* inode.c */
1046int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode,
1047 struct buffer_head *bh, ext3_fsblk_t blocknr);
1048struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *);
1049struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *);
1050int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
1051 sector_t iblock, unsigned long maxblocks, struct buffer_head *bh_result,
1052 int create);
1053
1054extern struct inode *ext3_iget(struct super_block *, unsigned long);
1055extern int ext3_write_inode (struct inode *, struct writeback_control *);
1056extern int ext3_setattr (struct dentry *, struct iattr *);
1057extern void ext3_evict_inode (struct inode *);
1058extern int ext3_sync_inode (handle_t *, struct inode *);
1059extern void ext3_discard_reservation (struct inode *);
1060extern void ext3_dirty_inode(struct inode *, int);
1061extern int ext3_change_inode_journal_flag(struct inode *, int);
1062extern int ext3_get_inode_loc(struct inode *, struct ext3_iloc *);
1063extern int ext3_can_truncate(struct inode *inode);
1064extern void ext3_truncate(struct inode *inode);
1065extern void ext3_set_inode_flags(struct inode *);
1066extern void ext3_get_inode_flags(struct ext3_inode_info *);
1067extern void ext3_set_aops(struct inode *inode);
1068extern int ext3_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
1069 u64 start, u64 len);
1070
1071/* ioctl.c */
1072extern long ext3_ioctl(struct file *, unsigned int, unsigned long);
1073extern long ext3_compat_ioctl(struct file *, unsigned int, unsigned long);
1074
1075/* namei.c */
1076extern int ext3_orphan_add(handle_t *, struct inode *);
1077extern int ext3_orphan_del(handle_t *, struct inode *);
1078extern int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
1079 __u32 start_minor_hash, __u32 *next_hash);
1080
1081/* resize.c */
1082extern int ext3_group_add(struct super_block *sb,
1083 struct ext3_new_group_data *input);
1084extern int ext3_group_extend(struct super_block *sb,
1085 struct ext3_super_block *es,
1086 ext3_fsblk_t n_blocks_count);
1087
1088/* super.c */
1089extern __printf(3, 4)
1090void ext3_error(struct super_block *, const char *, const char *, ...);
1091extern void __ext3_std_error (struct super_block *, const char *, int);
1092extern __printf(3, 4)
1093void ext3_abort(struct super_block *, const char *, const char *, ...);
1094extern __printf(3, 4)
1095void ext3_warning(struct super_block *, const char *, const char *, ...);
1096extern __printf(3, 4)
1097void ext3_msg(struct super_block *, const char *, const char *, ...);
1098extern void ext3_update_dynamic_rev (struct super_block *sb);
1099
1100#define ext3_std_error(sb, errno) \
1101do { \
1102 if ((errno)) \
1103 __ext3_std_error((sb), __func__, (errno)); \
1104} while (0)
1105
1106/*
1107 * Inodes and files operations
1108 */
1109
1110/* dir.c */
1111extern const struct file_operations ext3_dir_operations;
1112
1113/* file.c */
1114extern const struct inode_operations ext3_file_inode_operations;
1115extern const struct file_operations ext3_file_operations;
1116
1117/* namei.c */
1118extern const struct inode_operations ext3_dir_inode_operations;
1119extern const struct inode_operations ext3_special_inode_operations;
1120
1121/* symlink.c */
1122extern const struct inode_operations ext3_symlink_inode_operations;
1123extern const struct inode_operations ext3_fast_symlink_inode_operations;
1124
1125#define EXT3_JOURNAL(inode) (EXT3_SB((inode)->i_sb)->s_journal)
1126
1127/* Define the number of blocks we need to account to a transaction to
1128 * modify one block of data.
1129 *
1130 * We may have to touch one inode, one bitmap buffer, up to three
1131 * indirection blocks, the group and superblock summaries, and the data
1132 * block to complete the transaction. */
1133
1134#define EXT3_SINGLEDATA_TRANS_BLOCKS 8U
1135
1136/* Extended attribute operations touch at most two data buffers,
1137 * two bitmap buffers, and two group summaries, in addition to the inode
1138 * and the superblock, which are already accounted for. */
1139
1140#define EXT3_XATTR_TRANS_BLOCKS 6U
1141
1142/* Define the minimum size for a transaction which modifies data. This
1143 * needs to take into account the fact that we may end up modifying two
1144 * quota files too (one for the group, one for the user quota). The
1145 * superblock only gets updated once, of course, so don't bother
1146 * counting that again for the quota updates. */
1147
1148#define EXT3_DATA_TRANS_BLOCKS(sb) (EXT3_SINGLEDATA_TRANS_BLOCKS + \
1149 EXT3_XATTR_TRANS_BLOCKS - 2 + \
1150 EXT3_MAXQUOTAS_TRANS_BLOCKS(sb))
1151
1152/* Delete operations potentially hit one directory's namespace plus an
1153 * entire inode, plus arbitrary amounts of bitmap/indirection data. Be
1154 * generous. We can grow the delete transaction later if necessary. */
1155
1156#define EXT3_DELETE_TRANS_BLOCKS(sb) (EXT3_MAXQUOTAS_TRANS_BLOCKS(sb) + 64)
1157
1158/* Define an arbitrary limit for the amount of data we will anticipate
1159 * writing to any given transaction. For unbounded transactions such as
1160 * write(2) and truncate(2) we can write more than this, but we always
1161 * start off at the maximum transaction size and grow the transaction
1162 * optimistically as we go. */
1163
1164#define EXT3_MAX_TRANS_DATA 64U
1165
1166/* We break up a large truncate or write transaction once the handle's
1167 * buffer credits get this low; we need either to extend the
1168 * transaction or to start a new one. Reserve enough space here for
1169 * inode, bitmap, superblock, group and indirection updates for at least
1170 * one block, plus two quota updates. Quota allocations are not
1171 * needed. */
1172
1173#define EXT3_RESERVE_TRANS_BLOCKS 12U
1174
1175#define EXT3_INDEX_EXTRA_TRANS_BLOCKS 8
1176
1177#ifdef CONFIG_QUOTA
1178/* Amount of blocks needed for quota update - we know that the structure was
1179 * allocated so we need to update only inode+data */
1180#define EXT3_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 2 : 0)
1181/* Amount of blocks needed for quota insert/delete - we do some block writes
1182 * but inode, sb and group updates are done only once */
1183#define EXT3_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\
1184 (EXT3_SINGLEDATA_TRANS_BLOCKS-3)+3+DQUOT_INIT_REWRITE) : 0)
1185#define EXT3_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\
1186 (EXT3_SINGLEDATA_TRANS_BLOCKS-3)+3+DQUOT_DEL_REWRITE) : 0)
1187#else
1188#define EXT3_QUOTA_TRANS_BLOCKS(sb) 0
1189#define EXT3_QUOTA_INIT_BLOCKS(sb) 0
1190#define EXT3_QUOTA_DEL_BLOCKS(sb) 0
1191#endif
1192#define EXT3_MAXQUOTAS_TRANS_BLOCKS(sb) (EXT3_MAXQUOTAS*EXT3_QUOTA_TRANS_BLOCKS(sb))
1193#define EXT3_MAXQUOTAS_INIT_BLOCKS(sb) (EXT3_MAXQUOTAS*EXT3_QUOTA_INIT_BLOCKS(sb))
1194#define EXT3_MAXQUOTAS_DEL_BLOCKS(sb) (EXT3_MAXQUOTAS*EXT3_QUOTA_DEL_BLOCKS(sb))
1195
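To make the credit accounting above concrete, here is a stand-alone sketch of the same arithmetic. It assumes EXT3_MAXQUOTAS is 2 (user plus group quota) and journalled quota enabled; with quota disabled the per-quota terms are 0 and EXT3_DATA_TRANS_BLOCKS comes to 12.

/* Illustrative arithmetic for the transaction credit macros above; not kernel code. */
#include <stdio.h>

int main(void)
{
	unsigned singledata = 8;   /* EXT3_SINGLEDATA_TRANS_BLOCKS */
	unsigned xattr = 6;        /* EXT3_XATTR_TRANS_BLOCKS */
	unsigned maxquotas = 2;    /* assumed: user + group quota */
	unsigned quota_trans = 2;  /* EXT3_QUOTA_TRANS_BLOCKS with quota enabled */

	unsigned maxquotas_trans = maxquotas * quota_trans;              /* 4 */
	unsigned data_trans = singledata + xattr - 2 + maxquotas_trans;  /* 16 */
	unsigned delete_trans = maxquotas_trans + 64;                    /* 68 */

	printf("data=%u delete=%u\n", data_trans, delete_trans);
	return 0;
}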
1196int
1197ext3_mark_iloc_dirty(handle_t *handle,
1198 struct inode *inode,
1199 struct ext3_iloc *iloc);
1200
1201/*
1202 * On success, we end up with an outstanding reference count against
1203 * iloc->bh. This _must_ be cleaned up later.
1204 */
1205
1206int ext3_reserve_inode_write(handle_t *handle, struct inode *inode,
1207 struct ext3_iloc *iloc);
1208
1209int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode);
1210
1211/*
1212 * Wrapper functions with which ext3 calls into JBD. The intent here is
1213 * to allow these to be turned into appropriate stubs so ext3 can control
1214 * ext2 filesystems, so ext2+ext3 systems only need one fs. This work hasn't
1215 * been done yet.
1216 */
1217
1218static inline void ext3_journal_release_buffer(handle_t *handle,
1219 struct buffer_head *bh)
1220{
1221 journal_release_buffer(handle, bh);
1222}
1223
1224void ext3_journal_abort_handle(const char *caller, const char *err_fn,
1225 struct buffer_head *bh, handle_t *handle, int err);
1226
1227int __ext3_journal_get_undo_access(const char *where, handle_t *handle,
1228 struct buffer_head *bh);
1229
1230int __ext3_journal_get_write_access(const char *where, handle_t *handle,
1231 struct buffer_head *bh);
1232
1233int __ext3_journal_forget(const char *where, handle_t *handle,
1234 struct buffer_head *bh);
1235
1236int __ext3_journal_revoke(const char *where, handle_t *handle,
1237 unsigned long blocknr, struct buffer_head *bh);
1238
1239int __ext3_journal_get_create_access(const char *where,
1240 handle_t *handle, struct buffer_head *bh);
1241
1242int __ext3_journal_dirty_metadata(const char *where,
1243 handle_t *handle, struct buffer_head *bh);
1244
1245#define ext3_journal_get_undo_access(handle, bh) \
1246 __ext3_journal_get_undo_access(__func__, (handle), (bh))
1247#define ext3_journal_get_write_access(handle, bh) \
1248 __ext3_journal_get_write_access(__func__, (handle), (bh))
1249#define ext3_journal_revoke(handle, blocknr, bh) \
1250 __ext3_journal_revoke(__func__, (handle), (blocknr), (bh))
1251#define ext3_journal_get_create_access(handle, bh) \
1252 __ext3_journal_get_create_access(__func__, (handle), (bh))
1253#define ext3_journal_dirty_metadata(handle, bh) \
1254 __ext3_journal_dirty_metadata(__func__, (handle), (bh))
1255#define ext3_journal_forget(handle, bh) \
1256 __ext3_journal_forget(__func__, (handle), (bh))
1257
1258int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh);
1259
1260handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks);
1261int __ext3_journal_stop(const char *where, handle_t *handle);
1262
1263static inline handle_t *ext3_journal_start(struct inode *inode, int nblocks)
1264{
1265 return ext3_journal_start_sb(inode->i_sb, nblocks);
1266}
1267
1268#define ext3_journal_stop(handle) \
1269 __ext3_journal_stop(__func__, (handle))
1270
1271static inline handle_t *ext3_journal_current_handle(void)
1272{
1273 return journal_current_handle();
1274}
1275
1276static inline int ext3_journal_extend(handle_t *handle, int nblocks)
1277{
1278 return journal_extend(handle, nblocks);
1279}
1280
1281static inline int ext3_journal_restart(handle_t *handle, int nblocks)
1282{
1283 return journal_restart(handle, nblocks);
1284}
1285
1286static inline int ext3_journal_blocks_per_page(struct inode *inode)
1287{
1288 return journal_blocks_per_page(inode);
1289}
1290
1291static inline int ext3_journal_force_commit(journal_t *journal)
1292{
1293 return journal_force_commit(journal);
1294}
1295
1296/* super.c */
1297int ext3_force_commit(struct super_block *sb);
1298
1299static inline int ext3_should_journal_data(struct inode *inode)
1300{
1301 if (!S_ISREG(inode->i_mode))
1302 return 1;
1303 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA)
1304 return 1;
1305 if (EXT3_I(inode)->i_flags & EXT3_JOURNAL_DATA_FL)
1306 return 1;
1307 return 0;
1308}
1309
1310static inline int ext3_should_order_data(struct inode *inode)
1311{
1312 if (!S_ISREG(inode->i_mode))
1313 return 0;
1314 if (EXT3_I(inode)->i_flags & EXT3_JOURNAL_DATA_FL)
1315 return 0;
1316 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA)
1317 return 1;
1318 return 0;
1319}
1320
1321static inline int ext3_should_writeback_data(struct inode *inode)
1322{
1323 if (!S_ISREG(inode->i_mode))
1324 return 0;
1325 if (EXT3_I(inode)->i_flags & EXT3_JOURNAL_DATA_FL)
1326 return 0;
1327 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)
1328 return 1;
1329 return 0;
1330}
1331
1332#include <trace/events/ext3.h>
diff --git a/fs/ext3/ext3_jbd.c b/fs/ext3/ext3_jbd.c
deleted file mode 100644
index 785a3261a26c..000000000000
--- a/fs/ext3/ext3_jbd.c
+++ /dev/null
@@ -1,59 +0,0 @@
1/*
2 * Interface between ext3 and JBD
3 */
4
5#include "ext3.h"
6
7int __ext3_journal_get_undo_access(const char *where, handle_t *handle,
8 struct buffer_head *bh)
9{
10 int err = journal_get_undo_access(handle, bh);
11 if (err)
12 ext3_journal_abort_handle(where, __func__, bh, handle,err);
13 return err;
14}
15
16int __ext3_journal_get_write_access(const char *where, handle_t *handle,
17 struct buffer_head *bh)
18{
19 int err = journal_get_write_access(handle, bh);
20 if (err)
21 ext3_journal_abort_handle(where, __func__, bh, handle,err);
22 return err;
23}
24
25int __ext3_journal_forget(const char *where, handle_t *handle,
26 struct buffer_head *bh)
27{
28 int err = journal_forget(handle, bh);
29 if (err)
30 ext3_journal_abort_handle(where, __func__, bh, handle,err);
31 return err;
32}
33
34int __ext3_journal_revoke(const char *where, handle_t *handle,
35 unsigned long blocknr, struct buffer_head *bh)
36{
37 int err = journal_revoke(handle, blocknr, bh);
38 if (err)
39 ext3_journal_abort_handle(where, __func__, bh, handle,err);
40 return err;
41}
42
43int __ext3_journal_get_create_access(const char *where,
44 handle_t *handle, struct buffer_head *bh)
45{
46 int err = journal_get_create_access(handle, bh);
47 if (err)
48 ext3_journal_abort_handle(where, __func__, bh, handle,err);
49 return err;
50}
51
52int __ext3_journal_dirty_metadata(const char *where,
53 handle_t *handle, struct buffer_head *bh)
54{
55 int err = journal_dirty_metadata(handle, bh);
56 if (err)
57 ext3_journal_abort_handle(where, __func__, bh, handle,err);
58 return err;
59}
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
deleted file mode 100644
index 3b8f650de22c..000000000000
--- a/fs/ext3/file.c
+++ /dev/null
@@ -1,79 +0,0 @@
1/*
2 * linux/fs/ext3/file.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * from
10 *
11 * linux/fs/minix/file.c
12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds
14 *
15 * ext3 fs regular file handling primitives
16 *
17 * 64-bit file support on 64-bit platforms by Jakub Jelinek
18 * (jj@sunsite.ms.mff.cuni.cz)
19 */
20
21#include <linux/quotaops.h>
22#include "ext3.h"
23#include "xattr.h"
24#include "acl.h"
25
26/*
27 * Called when an inode is released. Note that this is different
28 * from ext3_file_open: open gets called at every open, but release
29 * gets called only when /all/ the files are closed.
30 */
31static int ext3_release_file (struct inode * inode, struct file * filp)
32{
33 if (ext3_test_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE)) {
34 filemap_flush(inode->i_mapping);
35 ext3_clear_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE);
36 }
37 /* if we are the last writer on the inode, drop the block reservation */
38 if ((filp->f_mode & FMODE_WRITE) &&
39 (atomic_read(&inode->i_writecount) == 1))
40 {
41 mutex_lock(&EXT3_I(inode)->truncate_mutex);
42 ext3_discard_reservation(inode);
43 mutex_unlock(&EXT3_I(inode)->truncate_mutex);
44 }
45 if (is_dx(inode) && filp->private_data)
46 ext3_htree_free_dir_info(filp->private_data);
47
48 return 0;
49}
50
51const struct file_operations ext3_file_operations = {
52 .llseek = generic_file_llseek,
53 .read_iter = generic_file_read_iter,
54 .write_iter = generic_file_write_iter,
55 .unlocked_ioctl = ext3_ioctl,
56#ifdef CONFIG_COMPAT
57 .compat_ioctl = ext3_compat_ioctl,
58#endif
59 .mmap = generic_file_mmap,
60 .open = dquot_file_open,
61 .release = ext3_release_file,
62 .fsync = ext3_sync_file,
63 .splice_read = generic_file_splice_read,
64 .splice_write = iter_file_splice_write,
65};
66
67const struct inode_operations ext3_file_inode_operations = {
68 .setattr = ext3_setattr,
69#ifdef CONFIG_EXT3_FS_XATTR
70 .setxattr = generic_setxattr,
71 .getxattr = generic_getxattr,
72 .listxattr = ext3_listxattr,
73 .removexattr = generic_removexattr,
74#endif
75 .get_acl = ext3_get_acl,
76 .set_acl = ext3_set_acl,
77 .fiemap = ext3_fiemap,
78};
79
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
deleted file mode 100644
index 1cb9c7e10c6f..000000000000
--- a/fs/ext3/fsync.c
+++ /dev/null
@@ -1,109 +0,0 @@
1/*
2 * linux/fs/ext3/fsync.c
3 *
4 * Copyright (C) 1993 Stephen Tweedie (sct@redhat.com)
5 * from
6 * Copyright (C) 1992 Remy Card (card@masi.ibp.fr)
7 * Laboratoire MASI - Institut Blaise Pascal
8 * Universite Pierre et Marie Curie (Paris VI)
9 * from
10 * linux/fs/minix/truncate.c Copyright (C) 1991, 1992 Linus Torvalds
11 *
12 * ext3fs fsync primitive
13 *
14 * Big-endian to little-endian byte-swapping/bitmaps by
15 * David S. Miller (davem@caip.rutgers.edu), 1995
16 *
17 * Removed unnecessary code duplication for little endian machines
18 * and excessive __inline__s.
19 * Andi Kleen, 1997
20 *
21 * Major simplifications and cleanup - we only need to do the metadata, because
22 * we can depend on generic_block_fdatasync() to sync the data blocks.
23 */
24
25#include <linux/blkdev.h>
26#include <linux/writeback.h>
27#include "ext3.h"
28
29/*
30 * akpm: A new design for ext3_sync_file().
31 *
32 * This is only called from sys_fsync(), sys_fdatasync() and sys_msync().
33 * There cannot be a transaction open by this task.
34 * Another task could have dirtied this inode. Its data can be in any
35 * state in the journalling system.
36 *
37 * What we do is just kick off a commit and wait on it. This will snapshot the
38 * inode to disk.
39 */
40
41int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
42{
43 struct inode *inode = file->f_mapping->host;
44 struct ext3_inode_info *ei = EXT3_I(inode);
45 journal_t *journal = EXT3_SB(inode->i_sb)->s_journal;
46 int ret, needs_barrier = 0;
47 tid_t commit_tid;
48
49 trace_ext3_sync_file_enter(file, datasync);
50
51 if (inode->i_sb->s_flags & MS_RDONLY) {
52 /* Make sure that we read updated state */
53 smp_rmb();
54 if (EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)
55 return -EROFS;
56 return 0;
57 }
58 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
59 if (ret)
60 goto out;
61
62 J_ASSERT(ext3_journal_current_handle() == NULL);
63
64 /*
65 * data=writeback,ordered:
66 * The caller's filemap_fdatawrite()/wait will sync the data.
67 * Metadata is in the journal, we wait for a proper transaction
68 * to commit here.
69 *
70 * data=journal:
71 * filemap_fdatawrite won't do anything (the buffers are clean).
72 * ext3_force_commit will write the file data into the journal and
73 * will wait on that.
74 * filemap_fdatawait() will encounter a ton of newly-dirtied pages
75 * (they were dirtied by commit). But that's OK - the blocks are
76 * safe in-journal, which is all fsync() needs to ensure.
77 */
78 if (ext3_should_journal_data(inode)) {
79 ret = ext3_force_commit(inode->i_sb);
80 goto out;
81 }
82
83 if (datasync)
84 commit_tid = atomic_read(&ei->i_datasync_tid);
85 else
86 commit_tid = atomic_read(&ei->i_sync_tid);
87
88 if (test_opt(inode->i_sb, BARRIER) &&
89 !journal_trans_will_send_data_barrier(journal, commit_tid))
90 needs_barrier = 1;
91 log_start_commit(journal, commit_tid);
92 ret = log_wait_commit(journal, commit_tid);
93
94 /*
95 * In case we didn't commit a transaction, we have to flush
96 * disk caches manually so that data really is on persistent
97 * storage
98 */
99 if (needs_barrier) {
100 int err;
101
102 err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
103 if (!ret)
104 ret = err;
105 }
106out:
107 trace_ext3_sync_file_exit(inode, ret);
108 return ret;
109}
diff --git a/fs/ext3/hash.c b/fs/ext3/hash.c
deleted file mode 100644
index ede315cdf126..000000000000
--- a/fs/ext3/hash.c
+++ /dev/null
@@ -1,206 +0,0 @@
1/*
2 * linux/fs/ext3/hash.c
3 *
4 * Copyright (C) 2002 by Theodore Ts'o
5 *
6 * This file is released under the GPL v2.
7 *
8 * This file may be redistributed under the terms of the GNU Public
9 * License.
10 */
11
12#include "ext3.h"
13#include <linux/cryptohash.h>
14
15#define DELTA 0x9E3779B9
16
17static void TEA_transform(__u32 buf[4], __u32 const in[])
18{
19 __u32 sum = 0;
20 __u32 b0 = buf[0], b1 = buf[1];
21 __u32 a = in[0], b = in[1], c = in[2], d = in[3];
22 int n = 16;
23
24 do {
25 sum += DELTA;
26 b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);
27 b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);
28 } while(--n);
29
30 buf[0] += b0;
31 buf[1] += b1;
32}
33
34
35/* The old legacy hash */
36static __u32 dx_hack_hash_unsigned(const char *name, int len)
37{
38 __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
39 const unsigned char *ucp = (const unsigned char *) name;
40
41 while (len--) {
42 hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373));
43
44 if (hash & 0x80000000)
45 hash -= 0x7fffffff;
46 hash1 = hash0;
47 hash0 = hash;
48 }
49 return hash0 << 1;
50}
51
52static __u32 dx_hack_hash_signed(const char *name, int len)
53{
54 __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
55 const signed char *scp = (const signed char *) name;
56
57 while (len--) {
58 hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373));
59
60 if (hash & 0x80000000)
61 hash -= 0x7fffffff;
62 hash1 = hash0;
63 hash0 = hash;
64 }
65 return hash0 << 1;
66}
67
68static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num)
69{
70 __u32 pad, val;
71 int i;
72 const signed char *scp = (const signed char *) msg;
73
74 pad = (__u32)len | ((__u32)len << 8);
75 pad |= pad << 16;
76
77 val = pad;
78 if (len > num*4)
79 len = num * 4;
80 for (i = 0; i < len; i++) {
81 if ((i % 4) == 0)
82 val = pad;
83 val = ((int) scp[i]) + (val << 8);
84 if ((i % 4) == 3) {
85 *buf++ = val;
86 val = pad;
87 num--;
88 }
89 }
90 if (--num >= 0)
91 *buf++ = val;
92 while (--num >= 0)
93 *buf++ = pad;
94}
95
96static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
97{
98 __u32 pad, val;
99 int i;
100 const unsigned char *ucp = (const unsigned char *) msg;
101
102 pad = (__u32)len | ((__u32)len << 8);
103 pad |= pad << 16;
104
105 val = pad;
106 if (len > num*4)
107 len = num * 4;
108 for (i=0; i < len; i++) {
109 if ((i % 4) == 0)
110 val = pad;
111 val = ((int) ucp[i]) + (val << 8);
112 if ((i % 4) == 3) {
113 *buf++ = val;
114 val = pad;
115 num--;
116 }
117 }
118 if (--num >= 0)
119 *buf++ = val;
120 while (--num >= 0)
121 *buf++ = pad;
122}
123
124/*
125 * Returns the hash of a filename. If len is 0 and name is NULL, then
126 * this function can be used to test whether or not a hash version is
127 * supported.
128 *
129 * The seed is a 4 longword (32 bits) "secret" which can be used to
130 * uniquify a hash. If the seed is all zeros, then some default seed
131 * may be used.
132 *
133 * A particular hash version specifies whether or not the seed is
134 * represented, and whether or not the returned hash is 32 bits or 64
135 * bits. 32 bit hashes will return 0 for the minor hash.
136 */
137int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
138{
139 __u32 hash;
140 __u32 minor_hash = 0;
141 const char *p;
142 int i;
143 __u32 in[8], buf[4];
144 void (*str2hashbuf)(const char *, int, __u32 *, int) =
145 str2hashbuf_signed;
146
147 /* Initialize the default seed for the hash checksum functions */
148 buf[0] = 0x67452301;
149 buf[1] = 0xefcdab89;
150 buf[2] = 0x98badcfe;
151 buf[3] = 0x10325476;
152
153 /* Check to see if the seed is all zeros */
154 if (hinfo->seed) {
155 for (i=0; i < 4; i++) {
156 if (hinfo->seed[i])
157 break;
158 }
159 if (i < 4)
160 memcpy(buf, hinfo->seed, sizeof(buf));
161 }
162
163 switch (hinfo->hash_version) {
164 case DX_HASH_LEGACY_UNSIGNED:
165 hash = dx_hack_hash_unsigned(name, len);
166 break;
167 case DX_HASH_LEGACY:
168 hash = dx_hack_hash_signed(name, len);
169 break;
170 case DX_HASH_HALF_MD4_UNSIGNED:
171 str2hashbuf = str2hashbuf_unsigned;
172 case DX_HASH_HALF_MD4:
173 p = name;
174 while (len > 0) {
175 (*str2hashbuf)(p, len, in, 8);
176 half_md4_transform(buf, in);
177 len -= 32;
178 p += 32;
179 }
180 minor_hash = buf[2];
181 hash = buf[1];
182 break;
183 case DX_HASH_TEA_UNSIGNED:
184 str2hashbuf = str2hashbuf_unsigned;
185 case DX_HASH_TEA:
186 p = name;
187 while (len > 0) {
188 (*str2hashbuf)(p, len, in, 4);
189 TEA_transform(buf, in);
190 len -= 16;
191 p += 16;
192 }
193 hash = buf[0];
194 minor_hash = buf[1];
195 break;
196 default:
197 hinfo->hash = 0;
198 return -1;
199 }
200 hash = hash & ~1;
201 if (hash == (EXT3_HTREE_EOF_32BIT << 1))
202 hash = (EXT3_HTREE_EOF_32BIT - 1) << 1;
203 hinfo->hash = hash;
204 hinfo->minor_hash = minor_hash;
205 return 0;
206}
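The legacy hash can be exercised outside the kernel; the following stand-alone sketch mirrors dx_hack_hash_signed() above so its wrap-around behaviour can be observed (illustration only, not part of the removed source):

#include <stdio.h>

/* Mirror of dx_hack_hash_signed() above, using plain unsigned int for __u32. */
static unsigned int dx_hack_hash(const char *name, int len)
{
	unsigned int hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
	const signed char *scp = (const signed char *) name;

	while (len--) {
		hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373));
		if (hash & 0x80000000)
			hash -= 0x7fffffff;
		hash1 = hash0;
		hash0 = hash;
	}
	return hash0 << 1;
}

int main(void)
{
	printf("hash(\"foo\") = 0x%08x\n", dx_hack_hash("foo", 3));
	return 0;
}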
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
deleted file mode 100644
index 3ad242e5840e..000000000000
--- a/fs/ext3/ialloc.c
+++ /dev/null
@@ -1,706 +0,0 @@
1/*
2 * linux/fs/ext3/ialloc.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * BSD ufs-inspired inode and directory allocation by
10 * Stephen Tweedie (sct@redhat.com), 1993
11 * Big-endian to little-endian byte-swapping/bitmaps by
12 * David S. Miller (davem@caip.rutgers.edu), 1995
13 */
14
15#include <linux/quotaops.h>
16#include <linux/random.h>
17
18#include "ext3.h"
19#include "xattr.h"
20#include "acl.h"
21
22/*
23 * ialloc.c contains the inodes allocation and deallocation routines
24 */
25
26/*
27 * The free inodes are managed by bitmaps. A file system contains several
28 * block groups. Each group contains 1 bitmap block for blocks, 1 bitmap
29 * block for inodes, N blocks for the inode table and data blocks.
30 *
31 * The file system contains group descriptors which are located after the
32 * super block. Each descriptor contains the number of the bitmap block and
33 * the free blocks count in the block.
34 */
35
36
37/*
38 * Read the inode allocation bitmap for a given block_group, reading
39 * into the specified slot in the superblock's bitmap cache.
40 *
41 * Return buffer_head of bitmap on success or NULL.
42 */
43static struct buffer_head *
44read_inode_bitmap(struct super_block * sb, unsigned long block_group)
45{
46 struct ext3_group_desc *desc;
47 struct buffer_head *bh = NULL;
48
49 desc = ext3_get_group_desc(sb, block_group, NULL);
50 if (!desc)
51 goto error_out;
52
53 bh = sb_bread(sb, le32_to_cpu(desc->bg_inode_bitmap));
54 if (!bh)
55 ext3_error(sb, "read_inode_bitmap",
56 "Cannot read inode bitmap - "
57 "block_group = %lu, inode_bitmap = %u",
58 block_group, le32_to_cpu(desc->bg_inode_bitmap));
59error_out:
60 return bh;
61}
62
63/*
64 * NOTE! When we get the inode, we're the only people
65 * that have access to it, and as such there are no
66 * race conditions we have to worry about. The inode
67 * is not on the hash-lists, and it cannot be reached
68 * through the filesystem because the directory entry
69 * has been deleted earlier.
70 *
71 * HOWEVER: we must make sure that we get no aliases,
72 * which means that we have to call "clear_inode()"
73 * _before_ we mark the inode not in use in the inode
74 * bitmaps. Otherwise a newly created file might use
75 * the same inode number (not actually the same pointer
76 * though), and then we'd have two inodes sharing the
77 * same inode number and space on the harddisk.
78 */
79void ext3_free_inode (handle_t *handle, struct inode * inode)
80{
81 struct super_block * sb = inode->i_sb;
82 int is_directory;
83 unsigned long ino;
84 struct buffer_head *bitmap_bh = NULL;
85 struct buffer_head *bh2;
86 unsigned long block_group;
87 unsigned long bit;
88 struct ext3_group_desc * gdp;
89 struct ext3_super_block * es;
90 struct ext3_sb_info *sbi;
91 int fatal = 0, err;
92
93 if (atomic_read(&inode->i_count) > 1) {
94 printk ("ext3_free_inode: inode has count=%d\n",
95 atomic_read(&inode->i_count));
96 return;
97 }
98 if (inode->i_nlink) {
99 printk ("ext3_free_inode: inode has nlink=%d\n",
100 inode->i_nlink);
101 return;
102 }
103 if (!sb) {
104 printk("ext3_free_inode: inode on nonexistent device\n");
105 return;
106 }
107 sbi = EXT3_SB(sb);
108
109 ino = inode->i_ino;
110 ext3_debug ("freeing inode %lu\n", ino);
111 trace_ext3_free_inode(inode);
112
113 is_directory = S_ISDIR(inode->i_mode);
114
115 es = EXT3_SB(sb)->s_es;
116 if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
117 ext3_error (sb, "ext3_free_inode",
118 "reserved or nonexistent inode %lu", ino);
119 goto error_return;
120 }
121 block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
122 bit = (ino - 1) % EXT3_INODES_PER_GROUP(sb);
123 bitmap_bh = read_inode_bitmap(sb, block_group);
124 if (!bitmap_bh)
125 goto error_return;
126
127 BUFFER_TRACE(bitmap_bh, "get_write_access");
128 fatal = ext3_journal_get_write_access(handle, bitmap_bh);
129 if (fatal)
130 goto error_return;
131
132 /* Ok, now we can actually update the inode bitmaps.. */
133 if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
134 bit, bitmap_bh->b_data))
135 ext3_error (sb, "ext3_free_inode",
136 "bit already cleared for inode %lu", ino);
137 else {
138 gdp = ext3_get_group_desc (sb, block_group, &bh2);
139
140 BUFFER_TRACE(bh2, "get_write_access");
141 fatal = ext3_journal_get_write_access(handle, bh2);
142 if (fatal) goto error_return;
143
144 if (gdp) {
145 spin_lock(sb_bgl_lock(sbi, block_group));
146 le16_add_cpu(&gdp->bg_free_inodes_count, 1);
147 if (is_directory)
148 le16_add_cpu(&gdp->bg_used_dirs_count, -1);
149 spin_unlock(sb_bgl_lock(sbi, block_group));
150 percpu_counter_inc(&sbi->s_freeinodes_counter);
151 if (is_directory)
152 percpu_counter_dec(&sbi->s_dirs_counter);
153
154 }
155 BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata");
156 err = ext3_journal_dirty_metadata(handle, bh2);
157 if (!fatal) fatal = err;
158 }
159 BUFFER_TRACE(bitmap_bh, "call ext3_journal_dirty_metadata");
160 err = ext3_journal_dirty_metadata(handle, bitmap_bh);
161 if (!fatal)
162 fatal = err;
163
164error_return:
165 brelse(bitmap_bh);
166 ext3_std_error(sb, fatal);
167}
168
169/*
170 * Orlov's allocator for directories.
171 *
172 * We always try to spread first-level directories.
173 *
174 * If there are blockgroups with both free inodes and free blocks counts
176 * not worse than average, we return the one with the smallest directory count.
176 * Otherwise we simply return a random group.
177 *
178 * The remaining rules are as follows:
179 *
180 * It's OK to put directory into a group unless
181 * it has too many directories already (max_dirs) or
182 * it has too few free inodes left (min_inodes) or
183 * it has too few free blocks left (min_blocks).
184 * Parent's group is preferred, if it doesn't satisfy these
185 * conditions we search cyclically through the rest. If none
186 * of the groups look good we just look for a group with more
187 * free inodes than average (starting at parent's group).
188 *
189 * Debt is incremented each time we allocate a directory and decremented
190 * when we allocate an inode, within 0--255.
191 */
192
193static int find_group_orlov(struct super_block *sb, struct inode *parent)
194{
195 int parent_group = EXT3_I(parent)->i_block_group;
196 struct ext3_sb_info *sbi = EXT3_SB(sb);
197 int ngroups = sbi->s_groups_count;
198 int inodes_per_group = EXT3_INODES_PER_GROUP(sb);
199 unsigned int freei, avefreei;
200 ext3_fsblk_t freeb, avefreeb;
201 unsigned int ndirs;
202 int max_dirs, min_inodes;
203 ext3_grpblk_t min_blocks;
204 int group = -1, i;
205 struct ext3_group_desc *desc;
206
207 freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
208 avefreei = freei / ngroups;
209 freeb = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
210 avefreeb = freeb / ngroups;
211 ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
212
213 if ((parent == d_inode(sb->s_root)) ||
214 (EXT3_I(parent)->i_flags & EXT3_TOPDIR_FL)) {
215 int best_ndir = inodes_per_group;
216 int best_group = -1;
217
218 group = prandom_u32();
219 parent_group = (unsigned)group % ngroups;
220 for (i = 0; i < ngroups; i++) {
221 group = (parent_group + i) % ngroups;
222 desc = ext3_get_group_desc (sb, group, NULL);
223 if (!desc || !desc->bg_free_inodes_count)
224 continue;
225 if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir)
226 continue;
227 if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
228 continue;
229 if (le16_to_cpu(desc->bg_free_blocks_count) < avefreeb)
230 continue;
231 best_group = group;
232 best_ndir = le16_to_cpu(desc->bg_used_dirs_count);
233 }
234 if (best_group >= 0)
235 return best_group;
236 goto fallback;
237 }
238
239 max_dirs = ndirs / ngroups + inodes_per_group / 16;
240 min_inodes = avefreei - inodes_per_group / 4;
241 min_blocks = avefreeb - EXT3_BLOCKS_PER_GROUP(sb) / 4;
242
243 for (i = 0; i < ngroups; i++) {
244 group = (parent_group + i) % ngroups;
245 desc = ext3_get_group_desc (sb, group, NULL);
246 if (!desc || !desc->bg_free_inodes_count)
247 continue;
248 if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs)
249 continue;
250 if (le16_to_cpu(desc->bg_free_inodes_count) < min_inodes)
251 continue;
252 if (le16_to_cpu(desc->bg_free_blocks_count) < min_blocks)
253 continue;
254 return group;
255 }
256
257fallback:
258 for (i = 0; i < ngroups; i++) {
259 group = (parent_group + i) % ngroups;
260 desc = ext3_get_group_desc (sb, group, NULL);
261 if (!desc || !desc->bg_free_inodes_count)
262 continue;
263 if (le16_to_cpu(desc->bg_free_inodes_count) >= avefreei)
264 return group;
265 }
266
267 if (avefreei) {
268 /*
269 * The free-inodes counter is approximate, and for really small
270 * filesystems the above test can fail to find any blockgroups
271 */
272 avefreei = 0;
273 goto fallback;
274 }
275
276 return -1;
277}
278
279static int find_group_other(struct super_block *sb, struct inode *parent)
280{
281 int parent_group = EXT3_I(parent)->i_block_group;
282 int ngroups = EXT3_SB(sb)->s_groups_count;
283 struct ext3_group_desc *desc;
284 int group, i;
285
286 /*
287 * Try to place the inode in its parent directory
288 */
289 group = parent_group;
290 desc = ext3_get_group_desc (sb, group, NULL);
291 if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
292 le16_to_cpu(desc->bg_free_blocks_count))
293 return group;
294
295 /*
296 * We're going to place this inode in a different blockgroup from its
297 * parent. We want to cause files in a common directory to all land in
298 * the same blockgroup. But we want files which are in a different
299 * directory which shares a blockgroup with our parent to land in a
300 * different blockgroup.
301 *
302 * So add our directory's i_ino into the starting point for the hash.
303 */
304 group = (group + parent->i_ino) % ngroups;
305
306 /*
307 * Use a quadratic hash to find a group with a free inode and some free
308 * blocks.
309 */
310 for (i = 1; i < ngroups; i <<= 1) {
311 group += i;
312 if (group >= ngroups)
313 group -= ngroups;
314 desc = ext3_get_group_desc (sb, group, NULL);
315 if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
316 le16_to_cpu(desc->bg_free_blocks_count))
317 return group;
318 }
319
320 /*
321 * That failed: try linear search for a free inode, even if that group
322 * has no free blocks.
323 */
324 group = parent_group;
325 for (i = 0; i < ngroups; i++) {
326 if (++group >= ngroups)
327 group = 0;
328 desc = ext3_get_group_desc (sb, group, NULL);
329 if (desc && le16_to_cpu(desc->bg_free_inodes_count))
330 return group;
331 }
332
333 return -1;
334}
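The quadratic probe in find_group_other() visits groups at offsets +1, +2, +4, ... from the hashed starting point; the stand-alone sketch below walks that sequence (the group count and inode numbers are made-up values for illustration):

#include <stdio.h>

int main(void)
{
	int ngroups = 16;
	int group = (5 + 1234) % ngroups;  /* parent group 5, directory ino 1234 (assumed) */
	int i;

	printf("start %d, probes:", group);
	for (i = 1; i < ngroups; i <<= 1) {
		group += i;
		if (group >= ngroups)
			group -= ngroups;
		printf(" %d", group);
	}
	printf("\n");
	return 0;
}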
335
336/*
337 * There are two policies for allocating an inode. If the new inode is
338 * a directory, then a forward search is made for a block group with both
339 * free space and a low directory-to-inode ratio; if that fails, then of
340 * the groups with above-average free space, that group with the fewest
341 * directories already is chosen.
342 *
343 * For other inodes, search forward from the parent directory's block
344 * group to find a free inode.
345 */
346struct inode *ext3_new_inode(handle_t *handle, struct inode * dir,
347 const struct qstr *qstr, umode_t mode)
348{
349 struct super_block *sb;
350 struct buffer_head *bitmap_bh = NULL;
351 struct buffer_head *bh2;
352 int group;
353 unsigned long ino = 0;
354 struct inode * inode;
355 struct ext3_group_desc * gdp = NULL;
356 struct ext3_super_block * es;
357 struct ext3_inode_info *ei;
358 struct ext3_sb_info *sbi;
359 int err = 0;
360 struct inode *ret;
361 int i;
362
363 /* Cannot create files in a deleted directory */
364 if (!dir || !dir->i_nlink)
365 return ERR_PTR(-EPERM);
366
367 sb = dir->i_sb;
368 trace_ext3_request_inode(dir, mode);
369 inode = new_inode(sb);
370 if (!inode)
371 return ERR_PTR(-ENOMEM);
372 ei = EXT3_I(inode);
373
374 sbi = EXT3_SB(sb);
375 es = sbi->s_es;
376 if (S_ISDIR(mode))
377 group = find_group_orlov(sb, dir);
378 else
379 group = find_group_other(sb, dir);
380
381 err = -ENOSPC;
382 if (group == -1)
383 goto out;
384
385 for (i = 0; i < sbi->s_groups_count; i++) {
386 err = -EIO;
387
388 gdp = ext3_get_group_desc(sb, group, &bh2);
389 if (!gdp)
390 goto fail;
391
392 brelse(bitmap_bh);
393 bitmap_bh = read_inode_bitmap(sb, group);
394 if (!bitmap_bh)
395 goto fail;
396
397 ino = 0;
398
399repeat_in_this_group:
400 ino = ext3_find_next_zero_bit((unsigned long *)
401 bitmap_bh->b_data, EXT3_INODES_PER_GROUP(sb), ino);
402 if (ino < EXT3_INODES_PER_GROUP(sb)) {
403
404 BUFFER_TRACE(bitmap_bh, "get_write_access");
405 err = ext3_journal_get_write_access(handle, bitmap_bh);
406 if (err)
407 goto fail;
408
409 if (!ext3_set_bit_atomic(sb_bgl_lock(sbi, group),
410 ino, bitmap_bh->b_data)) {
411 /* we won it */
412 BUFFER_TRACE(bitmap_bh,
413 "call ext3_journal_dirty_metadata");
414 err = ext3_journal_dirty_metadata(handle,
415 bitmap_bh);
416 if (err)
417 goto fail;
418 goto got;
419 }
420 /* we lost it */
421 journal_release_buffer(handle, bitmap_bh);
422
423 if (++ino < EXT3_INODES_PER_GROUP(sb))
424 goto repeat_in_this_group;
425 }
426
427 /*
427 * This case is possible in a concurrent environment. It is very
429 * rare. We cannot repeat the find_group_xxx() call because
430 * that will simply return the same blockgroup, because the
431 * group descriptor metadata has not yet been updated.
432 * So we just go onto the next blockgroup.
433 */
434 if (++group == sbi->s_groups_count)
435 group = 0;
436 }
437 err = -ENOSPC;
438 goto out;
439
440got:
441 ino += group * EXT3_INODES_PER_GROUP(sb) + 1;
442 if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
443 ext3_error (sb, "ext3_new_inode",
444 "reserved inode or inode > inodes count - "
445 "block_group = %d, inode=%lu", group, ino);
446 err = -EIO;
447 goto fail;
448 }
449
450 BUFFER_TRACE(bh2, "get_write_access");
451 err = ext3_journal_get_write_access(handle, bh2);
452 if (err) goto fail;
453 spin_lock(sb_bgl_lock(sbi, group));
454 le16_add_cpu(&gdp->bg_free_inodes_count, -1);
455 if (S_ISDIR(mode)) {
456 le16_add_cpu(&gdp->bg_used_dirs_count, 1);
457 }
458 spin_unlock(sb_bgl_lock(sbi, group));
459 BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata");
460 err = ext3_journal_dirty_metadata(handle, bh2);
461 if (err) goto fail;
462
463 percpu_counter_dec(&sbi->s_freeinodes_counter);
464 if (S_ISDIR(mode))
465 percpu_counter_inc(&sbi->s_dirs_counter);
466
467
468 if (test_opt(sb, GRPID)) {
469 inode->i_mode = mode;
470 inode->i_uid = current_fsuid();
471 inode->i_gid = dir->i_gid;
472 } else
473 inode_init_owner(inode, dir, mode);
474
475 inode->i_ino = ino;
476 /* This is the optimal IO size (for stat), not the fs block size */
477 inode->i_blocks = 0;
478 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
479
480 memset(ei->i_data, 0, sizeof(ei->i_data));
481 ei->i_dir_start_lookup = 0;
482 ei->i_disksize = 0;
483
484 ei->i_flags =
485 ext3_mask_flags(mode, EXT3_I(dir)->i_flags & EXT3_FL_INHERITED);
486#ifdef EXT3_FRAGMENTS
487 ei->i_faddr = 0;
488 ei->i_frag_no = 0;
489 ei->i_frag_size = 0;
490#endif
491 ei->i_file_acl = 0;
492 ei->i_dir_acl = 0;
493 ei->i_dtime = 0;
494 ei->i_block_alloc_info = NULL;
495 ei->i_block_group = group;
496
497 ext3_set_inode_flags(inode);
498 if (IS_DIRSYNC(inode))
499 handle->h_sync = 1;
500 if (insert_inode_locked(inode) < 0) {
501 /*
502 * Likely a bitmap corruption causing inode to be allocated
503 * twice.
504 */
505 err = -EIO;
506 goto fail;
507 }
508 spin_lock(&sbi->s_next_gen_lock);
509 inode->i_generation = sbi->s_next_generation++;
510 spin_unlock(&sbi->s_next_gen_lock);
511
512 ei->i_state_flags = 0;
513 ext3_set_inode_state(inode, EXT3_STATE_NEW);
514
515 /* See comment in ext3_iget for explanation */
516 if (ino >= EXT3_FIRST_INO(sb) + 1 &&
517 EXT3_INODE_SIZE(sb) > EXT3_GOOD_OLD_INODE_SIZE) {
518 ei->i_extra_isize =
519 sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE;
520 } else {
521 ei->i_extra_isize = 0;
522 }
523
524 ret = inode;
525 dquot_initialize(inode);
526 err = dquot_alloc_inode(inode);
527 if (err)
528 goto fail_drop;
529
530 err = ext3_init_acl(handle, inode, dir);
531 if (err)
532 goto fail_free_drop;
533
534 err = ext3_init_security(handle, inode, dir, qstr);
535 if (err)
536 goto fail_free_drop;
537
538 err = ext3_mark_inode_dirty(handle, inode);
539 if (err) {
540 ext3_std_error(sb, err);
541 goto fail_free_drop;
542 }
543
544 ext3_debug("allocating inode %lu\n", inode->i_ino);
545 trace_ext3_allocate_inode(inode, dir, mode);
546 goto really_out;
547fail:
548 ext3_std_error(sb, err);
549out:
550 iput(inode);
551 ret = ERR_PTR(err);
552really_out:
553 brelse(bitmap_bh);
554 return ret;
555
556fail_free_drop:
557 dquot_free_inode(inode);
558
559fail_drop:
560 dquot_drop(inode);
561 inode->i_flags |= S_NOQUOTA;
562 clear_nlink(inode);
563 unlock_new_inode(inode);
564 iput(inode);
565 brelse(bitmap_bh);
566 return ERR_PTR(err);
567}
568
569/* Verify that we are loading a valid orphan from disk */
570struct inode *ext3_orphan_get(struct super_block *sb, unsigned long ino)
571{
572 unsigned long max_ino = le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count);
573 unsigned long block_group;
574 int bit;
575 struct buffer_head *bitmap_bh;
576 struct inode *inode = NULL;
577 long err = -EIO;
578
579 /* Error cases - e2fsck has already cleaned up for us */
580 if (ino > max_ino) {
581 ext3_warning(sb, __func__,
582 "bad orphan ino %lu! e2fsck was run?", ino);
583 goto error;
584 }
585
586 block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
587 bit = (ino - 1) % EXT3_INODES_PER_GROUP(sb);
588 bitmap_bh = read_inode_bitmap(sb, block_group);
589 if (!bitmap_bh) {
590 ext3_warning(sb, __func__,
591 "inode bitmap error for orphan %lu", ino);
592 goto error;
593 }
594
595 /* Having the inode bit set should be a 100% indicator that this
596 * is a valid orphan (no e2fsck run on fs). Orphans also include
597 * inodes that were being truncated, so we can't check i_nlink==0.
598 */
599 if (!ext3_test_bit(bit, bitmap_bh->b_data))
600 goto bad_orphan;
601
602 inode = ext3_iget(sb, ino);
603 if (IS_ERR(inode))
604 goto iget_failed;
605
606 /*
607 * If the orphan has i_nlink > 0 then it should be able to be
608 * truncated, otherwise it won't be removed from the orphan list
609 * during processing and an infinite loop will result.
610 */
611 if (inode->i_nlink && !ext3_can_truncate(inode))
612 goto bad_orphan;
613
614 if (NEXT_ORPHAN(inode) > max_ino)
615 goto bad_orphan;
616 brelse(bitmap_bh);
617 return inode;
618
619iget_failed:
620 err = PTR_ERR(inode);
621 inode = NULL;
622bad_orphan:
623 ext3_warning(sb, __func__,
624 "bad orphan inode %lu! e2fsck was run?", ino);
625 printk(KERN_NOTICE "ext3_test_bit(bit=%d, block=%llu) = %d\n",
626 bit, (unsigned long long)bitmap_bh->b_blocknr,
627 ext3_test_bit(bit, bitmap_bh->b_data));
628 printk(KERN_NOTICE "inode=%p\n", inode);
629 if (inode) {
630 printk(KERN_NOTICE "is_bad_inode(inode)=%d\n",
631 is_bad_inode(inode));
632 printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n",
633 NEXT_ORPHAN(inode));
634 printk(KERN_NOTICE "max_ino=%lu\n", max_ino);
635 printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink);
636 /* Avoid freeing blocks if we got a bad deleted inode */
637 if (inode->i_nlink == 0)
638 inode->i_blocks = 0;
639 iput(inode);
640 }
641 brelse(bitmap_bh);
642error:
643 return ERR_PTR(err);
644}
645
646unsigned long ext3_count_free_inodes (struct super_block * sb)
647{
648 unsigned long desc_count;
649 struct ext3_group_desc *gdp;
650 int i;
651#ifdef EXT3FS_DEBUG
652 struct ext3_super_block *es;
653 unsigned long bitmap_count, x;
654 struct buffer_head *bitmap_bh = NULL;
655
656 es = EXT3_SB(sb)->s_es;
657 desc_count = 0;
658 bitmap_count = 0;
659 gdp = NULL;
660 for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) {
661 gdp = ext3_get_group_desc (sb, i, NULL);
662 if (!gdp)
663 continue;
664 desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
665 brelse(bitmap_bh);
666 bitmap_bh = read_inode_bitmap(sb, i);
667 if (!bitmap_bh)
668 continue;
669
670 x = ext3_count_free(bitmap_bh, EXT3_INODES_PER_GROUP(sb) / 8);
671 printk("group %d: stored = %d, counted = %lu\n",
672 i, le16_to_cpu(gdp->bg_free_inodes_count), x);
673 bitmap_count += x;
674 }
675 brelse(bitmap_bh);
676 printk("ext3_count_free_inodes: stored = %u, computed = %lu, %lu\n",
677 le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count);
678 return desc_count;
679#else
680 desc_count = 0;
681 for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) {
682 gdp = ext3_get_group_desc (sb, i, NULL);
683 if (!gdp)
684 continue;
685 desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
686 cond_resched();
687 }
688 return desc_count;
689#endif
690}
691
692/* Called at mount-time, super-block is locked */
693unsigned long ext3_count_dirs (struct super_block * sb)
694{
695 unsigned long count = 0;
696 int i;
697
698 for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) {
699 struct ext3_group_desc *gdp = ext3_get_group_desc (sb, i, NULL);
700 if (!gdp)
701 continue;
702 count += le16_to_cpu(gdp->bg_used_dirs_count);
703 }
704 return count;
705}
706
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
deleted file mode 100644
index 6c7e5468a2f8..000000000000
--- a/fs/ext3/inode.c
+++ /dev/null
@@ -1,3574 +0,0 @@
1/*
2 * linux/fs/ext3/inode.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * from
10 *
11 * linux/fs/minix/inode.c
12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds
14 *
15 * Goal-directed block allocation by Stephen Tweedie
16 * (sct@redhat.com), 1993, 1998
17 * Big-endian to little-endian byte-swapping/bitmaps by
18 * David S. Miller (davem@caip.rutgers.edu), 1995
19 * 64-bit file support on 64-bit platforms by Jakub Jelinek
20 * (jj@sunsite.ms.mff.cuni.cz)
21 *
22 * Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000
23 */
24
25#include <linux/highuid.h>
26#include <linux/quotaops.h>
27#include <linux/writeback.h>
28#include <linux/mpage.h>
29#include <linux/namei.h>
30#include <linux/uio.h>
31#include "ext3.h"
32#include "xattr.h"
33#include "acl.h"
34
35static int ext3_writepage_trans_blocks(struct inode *inode);
36static int ext3_block_truncate_page(struct inode *inode, loff_t from);
37
38/*
39 * Test whether an inode is a fast symlink.
40 */
41static int ext3_inode_is_fast_symlink(struct inode *inode)
42{
43 int ea_blocks = EXT3_I(inode)->i_file_acl ?
44 (inode->i_sb->s_blocksize >> 9) : 0;
45
46 return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
47}
48
49/*
50 * The ext3 forget function must perform a revoke if we are freeing data
51 * which has been journaled. Metadata (eg. indirect blocks) must be
52 * revoked in all cases.
53 *
54 * "bh" may be NULL: a metadata block may have been freed from memory
55 * but there may still be a record of it in the journal, and that record
56 * still needs to be revoked.
57 */
58int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode,
59 struct buffer_head *bh, ext3_fsblk_t blocknr)
60{
61 int err;
62
63 might_sleep();
64
65 trace_ext3_forget(inode, is_metadata, blocknr);
66 BUFFER_TRACE(bh, "enter");
67
68 jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
69 "data mode %lx\n",
70 bh, is_metadata, inode->i_mode,
71 test_opt(inode->i_sb, DATA_FLAGS));
72
73 /* Never use the revoke function if we are doing full data
74 * journaling: there is no need to, and a V1 superblock won't
75 * support it. Otherwise, only skip the revoke on un-journaled
76 * data blocks. */
77
78 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ||
79 (!is_metadata && !ext3_should_journal_data(inode))) {
80 if (bh) {
81 BUFFER_TRACE(bh, "call journal_forget");
82 return ext3_journal_forget(handle, bh);
83 }
84 return 0;
85 }
86
87 /*
88 * data!=journal && (is_metadata || should_journal_data(inode))
89 */
90 BUFFER_TRACE(bh, "call ext3_journal_revoke");
91 err = ext3_journal_revoke(handle, blocknr, bh);
92 if (err)
93 ext3_abort(inode->i_sb, __func__,
94 "error %d when attempting revoke", err);
95 BUFFER_TRACE(bh, "exit");
96 return err;
97}
98
99/*
100 * Work out how many blocks we need to proceed with the next chunk of a
101 * truncate transaction.
102 */
103static unsigned long blocks_for_truncate(struct inode *inode)
104{
105 unsigned long needed;
106
107 needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
108
109 /* Give ourselves just enough room to cope with inodes in which
110 * i_blocks is corrupt: we've seen disk corruptions in the past
111 * which resulted in random data in an inode which looked enough
112 * like a regular file for ext3 to try to delete it. Things
113 * will go a bit crazy if that happens, but at least we should
114 * try not to panic the whole kernel. */
115 if (needed < 2)
116 needed = 2;
117
118 /* But we need to bound the transaction so we don't overflow the
119 * journal. */
120 if (needed > EXT3_MAX_TRANS_DATA)
121 needed = EXT3_MAX_TRANS_DATA;
122
123 return EXT3_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
124}
125
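To make the clamping above concrete, here is a hedged user-space sketch of the same calculation; EXT3_MAX_TRANS_DATA and the fixed per-transaction overhead are stand-in values chosen only for illustration and are not taken from this diff.

#include <stdio.h>

/* Stand-in values for illustration only (assumptions, not from the diff). */
#define MAX_TRANS_DATA      64  /* assumed cap on data blocks per transaction */
#define DATA_TRANS_BLOCKS    8  /* assumed fixed per-transaction overhead     */

/* Same shape as blocks_for_truncate(): size the next truncate chunk from
 * i_blocks (512-byte units) and clamp it to a sane range. */
static unsigned long truncate_credits(unsigned long i_blocks,
				      unsigned int blocksize_bits)
{
	unsigned long needed = i_blocks >> (blocksize_bits - 9);

	if (needed < 2)			/* cope with corrupt i_blocks */
		needed = 2;
	if (needed > MAX_TRANS_DATA)	/* don't overflow the journal */
		needed = MAX_TRANS_DATA;

	return DATA_TRANS_BLOCKS + needed;
}

int main(void)
{
	/* A 1MB file with 4KB blocks: i_blocks = 2048 512-byte sectors. */
	printf("credits = %lu\n", truncate_credits(2048, 12));
	return 0;
}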
126/*
127 * Truncate transactions can be complex and absolutely huge. So we need to
128 * be able to restart the transaction at a convenient checkpoint to make
129 * sure we don't overflow the journal.
130 *
131 * start_transaction gets us a new handle for a truncate transaction,
132 * and extend_transaction tries to extend the existing one a bit. If
133 * extend fails, we need to propagate the failure up and restart the
134 * transaction in the top-level truncate loop. --sct
135 */
136static handle_t *start_transaction(struct inode *inode)
137{
138 handle_t *result;
139
140 result = ext3_journal_start(inode, blocks_for_truncate(inode));
141 if (!IS_ERR(result))
142 return result;
143
144 ext3_std_error(inode->i_sb, PTR_ERR(result));
145 return result;
146}
147
148/*
149 * Try to extend this transaction for the purposes of truncation.
150 *
151 * Returns 0 if we managed to create more room. If we can't create more
152 * room, and the transaction must be restarted we return 1.
153 */
154static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
155{
156 if (handle->h_buffer_credits > EXT3_RESERVE_TRANS_BLOCKS)
157 return 0;
158 if (!ext3_journal_extend(handle, blocks_for_truncate(inode)))
159 return 0;
160 return 1;
161}
162
163/*
164 * Restart the transaction associated with *handle. This does a commit,
165 * so before we call here everything must be consistently dirtied against
166 * this transaction.
167 */
168static int truncate_restart_transaction(handle_t *handle, struct inode *inode)
169{
170 int ret;
171
172 jbd_debug(2, "restarting handle %p\n", handle);
173 /*
174 * Drop truncate_mutex to avoid deadlock with ext3_get_blocks_handle
175 * At this moment, get_block can be called only for blocks inside
176 * i_size since page cache has been already dropped and writes are
177 * blocked by i_mutex. So we can safely drop the truncate_mutex.
178 */
179 mutex_unlock(&EXT3_I(inode)->truncate_mutex);
180 ret = ext3_journal_restart(handle, blocks_for_truncate(inode));
181 mutex_lock(&EXT3_I(inode)->truncate_mutex);
182 return ret;
183}
184
185/*
186 * Called at inode eviction from icache
187 */
188void ext3_evict_inode (struct inode *inode)
189{
190 struct ext3_inode_info *ei = EXT3_I(inode);
191 struct ext3_block_alloc_info *rsv;
192 handle_t *handle;
193 int want_delete = 0;
194
195 trace_ext3_evict_inode(inode);
196 if (!inode->i_nlink && !is_bad_inode(inode)) {
197 dquot_initialize(inode);
198 want_delete = 1;
199 }
200
201 /*
202 * When journalling data dirty buffers are tracked only in the journal.
203 * So although mm thinks everything is clean and ready for reaping the
204 * inode might still have some pages to write in the running
205 * transaction or waiting to be checkpointed. Thus calling
206 * journal_invalidatepage() (via truncate_inode_pages()) to discard
207 * these buffers can cause data loss. Also even if we did not discard
208 * these buffers, we would have no way to find them after the inode
209 * is reaped and thus a user could see stale data when trying to read
210 * them before the transaction is checkpointed. So be careful and
211 * force everything to disk here... We use ei->i_datasync_tid to
212 * store the newest transaction containing inode's data.
213 *
214 * Note that directories do not have this problem because they don't
215 * use page cache.
216 *
217 * The s_journal check handles the case when ext3_get_journal() fails
218 * and puts the journal inode.
219 */
220 if (inode->i_nlink && ext3_should_journal_data(inode) &&
221 EXT3_SB(inode->i_sb)->s_journal &&
222 (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) &&
223 inode->i_ino != EXT3_JOURNAL_INO) {
224 tid_t commit_tid = atomic_read(&ei->i_datasync_tid);
225 journal_t *journal = EXT3_SB(inode->i_sb)->s_journal;
226
227 log_start_commit(journal, commit_tid);
228 log_wait_commit(journal, commit_tid);
229 filemap_write_and_wait(&inode->i_data);
230 }
231 truncate_inode_pages_final(&inode->i_data);
232
233 ext3_discard_reservation(inode);
234 rsv = ei->i_block_alloc_info;
235 ei->i_block_alloc_info = NULL;
236 if (unlikely(rsv))
237 kfree(rsv);
238
239 if (!want_delete)
240 goto no_delete;
241
242 handle = start_transaction(inode);
243 if (IS_ERR(handle)) {
244 /*
245 * If we're going to skip the normal cleanup, we still need to
246 * make sure that the in-core orphan linked list is properly
247 * cleaned up.
248 */
249 ext3_orphan_del(NULL, inode);
250 goto no_delete;
251 }
252
253 if (IS_SYNC(inode))
254 handle->h_sync = 1;
255 inode->i_size = 0;
256 if (inode->i_blocks)
257 ext3_truncate(inode);
258 /*
259 * Kill off the orphan record created when the inode lost the last
260 * link. Note that ext3_orphan_del() has to be able to cope with the
261 * deletion of a non-existent orphan - ext3_truncate() could
262 * have removed the record.
263 */
264 ext3_orphan_del(handle, inode);
265 ei->i_dtime = get_seconds();
266
267 /*
268 * One subtle ordering requirement: if anything has gone wrong
269 * (transaction abort, IO errors, whatever), then we can still
270 * do these next steps (the fs will already have been marked as
271 * having errors), but we can't free the inode if the mark_dirty
272 * fails.
273 */
274 if (ext3_mark_inode_dirty(handle, inode)) {
275 /* If that failed, just dquot_drop() and be done with that */
276 dquot_drop(inode);
277 clear_inode(inode);
278 } else {
279 ext3_xattr_delete_inode(handle, inode);
280 dquot_free_inode(inode);
281 dquot_drop(inode);
282 clear_inode(inode);
283 ext3_free_inode(handle, inode);
284 }
285 ext3_journal_stop(handle);
286 return;
287no_delete:
288 clear_inode(inode);
289 dquot_drop(inode);
290}
291
292typedef struct {
293 __le32 *p;
294 __le32 key;
295 struct buffer_head *bh;
296} Indirect;
297
298static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
299{
300 p->key = *(p->p = v);
301 p->bh = bh;
302}
303
304static int verify_chain(Indirect *from, Indirect *to)
305{
306 while (from <= to && from->key == *from->p)
307 from++;
308 return (from > to);
309}
310
311/**
312 * ext3_block_to_path - parse the block number into array of offsets
313 * @inode: inode in question (we are only interested in its superblock)
314 * @i_block: block number to be parsed
315 * @offsets: array to store the offsets in
316 * @boundary: set this non-zero if the referred-to block is likely to be
317 * followed (on disk) by an indirect block.
318 *
319 * To store the locations of a file's data ext3 uses a data structure common
320 * for UNIX filesystems - tree of pointers anchored in the inode, with
321 * data blocks at leaves and indirect blocks in intermediate nodes.
322 * This function translates the block number into a path in that tree -
323 * the return value is the path length and @offsets[n] is the offset of the
324 * pointer to the (n+1)th node in the nth one. If @block is out of range
325 * (negative or too large), a warning is printed and zero is returned.
326 *
327 * Note: function doesn't find node addresses, so no IO is needed. All
328 * we need to know is the capacity of indirect blocks (taken from the
329 * inode->i_sb).
330 */
331
332/*
333 * Portability note: the last comparison (check that we fit into triple
334 * indirect block) is spelled differently, because otherwise on an
335 * architecture with 32-bit longs and 8Kb pages we might get into trouble
336 * if our filesystem had 8Kb blocks. We might use long long, but that would
337 * kill us on x86. Oh, well, at least the sign propagation does not matter -
338 * i_block would have to be negative in the very beginning, so we would not
339 * get there at all.
340 */
341
342static int ext3_block_to_path(struct inode *inode,
343 long i_block, int offsets[4], int *boundary)
344{
345 int ptrs = EXT3_ADDR_PER_BLOCK(inode->i_sb);
346 int ptrs_bits = EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb);
347 const long direct_blocks = EXT3_NDIR_BLOCKS,
348 indirect_blocks = ptrs,
349 double_blocks = (1 << (ptrs_bits * 2));
350 int n = 0;
351 int final = 0;
352
353 if (i_block < 0) {
354 ext3_warning (inode->i_sb, "ext3_block_to_path", "block < 0");
355 } else if (i_block < direct_blocks) {
356 offsets[n++] = i_block;
357 final = direct_blocks;
358 } else if ( (i_block -= direct_blocks) < indirect_blocks) {
359 offsets[n++] = EXT3_IND_BLOCK;
360 offsets[n++] = i_block;
361 final = ptrs;
362 } else if ((i_block -= indirect_blocks) < double_blocks) {
363 offsets[n++] = EXT3_DIND_BLOCK;
364 offsets[n++] = i_block >> ptrs_bits;
365 offsets[n++] = i_block & (ptrs - 1);
366 final = ptrs;
367 } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
368 offsets[n++] = EXT3_TIND_BLOCK;
369 offsets[n++] = i_block >> (ptrs_bits * 2);
370 offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
371 offsets[n++] = i_block & (ptrs - 1);
372 final = ptrs;
373 } else {
374 ext3_warning(inode->i_sb, "ext3_block_to_path", "block > big");
375 }
376 if (boundary)
377 *boundary = final - 1 - (i_block & (ptrs - 1));
378 return n;
379}
380
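The mapping above is easiest to see with concrete numbers. The sketch below is a hedged, user-space illustration (not the ext3 code itself): it assumes a 4KB block size, so each indirect block holds 1024 32-bit pointers, and it mirrors the branching in ext3_block_to_path() to print the offset path for a few logical block numbers. The direct-pointer count of 12 and the IND/DIND/TIND slots 12, 13 and 14 are taken as assumptions matching the usual i_data layout.

#include <stdio.h>

/* Illustrative constants for a 4KB block size (assumptions, not from the diff). */
#define NDIR_BLOCKS 12   /* direct pointers in i_data          */
#define IND_BLOCK   12   /* slot of the single-indirect block  */
#define DIND_BLOCK  13   /* slot of the double-indirect block  */
#define TIND_BLOCK  14   /* slot of the triple-indirect block  */
#define PTRS        1024 /* 4096 / sizeof(__le32)              */
#define PTRS_BITS   10

/* Mirror of the branching in ext3_block_to_path(), returning the depth. */
static int block_to_path(long i_block, int offsets[4])
{
	int n = 0;

	if (i_block < NDIR_BLOCKS) {
		offsets[n++] = i_block;
	} else if ((i_block -= NDIR_BLOCKS) < PTRS) {
		offsets[n++] = IND_BLOCK;
		offsets[n++] = i_block;
	} else if ((i_block -= PTRS) < (1 << (PTRS_BITS * 2))) {
		offsets[n++] = DIND_BLOCK;
		offsets[n++] = i_block >> PTRS_BITS;
		offsets[n++] = i_block & (PTRS - 1);
	} else {
		i_block -= 1 << (PTRS_BITS * 2);
		offsets[n++] = TIND_BLOCK;
		offsets[n++] = i_block >> (PTRS_BITS * 2);
		offsets[n++] = (i_block >> PTRS_BITS) & (PTRS - 1);
		offsets[n++] = i_block & (PTRS - 1);
	}
	return n;
}

int main(void)
{
	long blocks[] = { 5, 12, 5000, 2000000 };
	for (unsigned i = 0; i < sizeof(blocks) / sizeof(blocks[0]); i++) {
		int offsets[4], depth = block_to_path(blocks[i], offsets);
		printf("block %7ld -> depth %d, offsets", blocks[i], depth);
		for (int j = 0; j < depth; j++)
			printf(" %d", offsets[j]);
		printf("\n");
	}
	return 0;
}

For example, logical block 5000 on such a filesystem resolves to depth 3 with offsets {13, 3, 892}: the double-indirect slot, then the 4th pointer in it, then the 893rd pointer in that indirect block.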
381/**
382 * ext3_get_branch - read the chain of indirect blocks leading to data
383 * @inode: inode in question
384 * @depth: depth of the chain (1 - direct pointer, etc.)
385 * @offsets: offsets of pointers in inode/indirect blocks
386 * @chain: place to store the result
387 * @err: here we store the error value
388 *
389 * Function fills the array of triples <key, p, bh> and returns %NULL
390 * if everything went OK or the pointer to the last filled triple
391 * (incomplete one) otherwise. Upon the return chain[i].key contains
392 * the number of (i+1)-th block in the chain (as it is stored in memory,
393 * i.e. little-endian 32-bit), chain[i].p contains the address of that
394 * number (it points into struct inode for i==0 and into the bh->b_data
395 * for i>0) and chain[i].bh points to the buffer_head of i-th indirect
396 * block for i>0 and NULL for i==0. In other words, it holds the block
397 * numbers of the chain, addresses they were taken from (and where we can
398 * verify that chain did not change) and buffer_heads hosting these
399 * numbers.
400 *
401 * Function stops when it stumbles upon zero pointer (absent block)
402 * (pointer to last triple returned, *@err == 0)
403 * or when it gets an IO error reading an indirect block
404 * (ditto, *@err == -EIO)
405 * or when it notices that chain had been changed while it was reading
406 * (ditto, *@err == -EAGAIN)
407 * or when it reads all @depth-1 indirect blocks successfully and finds
408 * the whole chain, all way to the data (returns %NULL, *err == 0).
409 */
410static Indirect *ext3_get_branch(struct inode *inode, int depth, int *offsets,
411 Indirect chain[4], int *err)
412{
413 struct super_block *sb = inode->i_sb;
414 Indirect *p = chain;
415 struct buffer_head *bh;
416
417 *err = 0;
418 /* i_data is not going away, no lock needed */
419 add_chain (chain, NULL, EXT3_I(inode)->i_data + *offsets);
420 if (!p->key)
421 goto no_block;
422 while (--depth) {
423 bh = sb_bread(sb, le32_to_cpu(p->key));
424 if (!bh)
425 goto failure;
426 /* Reader: pointers */
427 if (!verify_chain(chain, p))
428 goto changed;
429 add_chain(++p, bh, (__le32*)bh->b_data + *++offsets);
430 /* Reader: end */
431 if (!p->key)
432 goto no_block;
433 }
434 return NULL;
435
436changed:
437 brelse(bh);
438 *err = -EAGAIN;
439 goto no_block;
440failure:
441 *err = -EIO;
442no_block:
443 return p;
444}
445
446/**
447 * ext3_find_near - find a place for allocation with sufficient locality
448 * @inode: owner
449 * @ind: descriptor of indirect block.
450 *
451 * This function returns the preferred place for block allocation.
452 * It is used when heuristic for sequential allocation fails.
453 * Rules are:
454 * + if there is a block to the left of our position - allocate near it.
455 * + if pointer will live in indirect block - allocate near that block.
456 * + if pointer will live in inode - allocate in the same
457 * cylinder group.
458 *
459 * In the latter case we colour the starting block by the caller's PID to
460 * prevent it from clashing with concurrent allocations for a different inode
461 * in the same block group. The PID is used here so that functionally related
462 * files will be close-by on-disk.
463 *
464 * Caller must make sure that @ind is valid and will stay that way.
465 */
466static ext3_fsblk_t ext3_find_near(struct inode *inode, Indirect *ind)
467{
468 struct ext3_inode_info *ei = EXT3_I(inode);
469 __le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data;
470 __le32 *p;
471 ext3_fsblk_t bg_start;
472 ext3_grpblk_t colour;
473
474 /* Try to find previous block */
475 for (p = ind->p - 1; p >= start; p--) {
476 if (*p)
477 return le32_to_cpu(*p);
478 }
479
480 /* No such thing, so let's try location of indirect block */
481 if (ind->bh)
482 return ind->bh->b_blocknr;
483
484 /*
485 * It is going to be referred to from the inode itself? OK, just put it
486 * into the same cylinder group then.
487 */
488 bg_start = ext3_group_first_block_no(inode->i_sb, ei->i_block_group);
489 colour = (current->pid % 16) *
490 (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
491 return bg_start + colour;
492}
493
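For the cylinder-group fallback above, a short hedged sketch of the colouring arithmetic (user-space illustration only): with 32768 blocks per group assumed, each of the 16 PID-derived colours starts a different 2048-block slice of the group, which is what keeps concurrent allocators for unrelated inodes apart. The group size and starting block below are hypothetical.

#include <stdio.h>

/* Same colouring arithmetic as ext3_find_near()'s fallback, with an assumed
 * 32768 blocks per group and a hypothetical group start block. */
int main(void)
{
	unsigned long blocks_per_group = 32768;
	unsigned long bg_start = 1638400;	/* hypothetical first block of the group */

	for (unsigned pid = 100; pid < 104; pid++) {
		unsigned long colour = (pid % 16) * (blocks_per_group / 16);
		printf("pid %u -> goal block %lu\n", pid, bg_start + colour);
	}
	return 0;
}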
494/**
495 * ext3_find_goal - find a preferred place for allocation.
496 * @inode: owner
497 * @block: block we want
498 * @partial: pointer to the last triple within a chain
499 *
500 * Normally this function finds the preferred place for block allocation
501 * and returns it.
502 */
503
504static ext3_fsblk_t ext3_find_goal(struct inode *inode, long block,
505 Indirect *partial)
506{
507 struct ext3_block_alloc_info *block_i;
508
509 block_i = EXT3_I(inode)->i_block_alloc_info;
510
511 /*
512 * try the heuristic for sequential allocation,
513 * failing that at least try to get decent locality.
514 */
515 if (block_i && (block == block_i->last_alloc_logical_block + 1)
516 && (block_i->last_alloc_physical_block != 0)) {
517 return block_i->last_alloc_physical_block + 1;
518 }
519
520 return ext3_find_near(inode, partial);
521}
522
523/**
524 * ext3_blks_to_allocate - Look up the block map and count the number
525 * of direct blocks that need to be allocated for the given branch.
526 *
527 * @branch: chain of indirect blocks
528 * @k: number of blocks needed for indirect blocks
529 * @blks: number of data blocks to be mapped.
530 * @blocks_to_boundary: the offset in the indirect block
531 *
532 * return the total number of blocks to be allocated, including the
533 * direct and indirect blocks.
534 */
535static int ext3_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
536 int blocks_to_boundary)
537{
538 unsigned long count = 0;
539
540 /*
541 * Simple case: the [t,d]indirect block(s) has not been allocated yet,
542 * so it's clear the blocks on that path have not been allocated either
543 */
544 if (k > 0) {
545 /* right now we don't handle cross boundary allocation */
546 if (blks < blocks_to_boundary + 1)
547 count += blks;
548 else
549 count += blocks_to_boundary + 1;
550 return count;
551 }
552
553 count++;
554 while (count < blks && count <= blocks_to_boundary &&
555 le32_to_cpu(*(branch[0].p + count)) == 0) {
556 count++;
557 }
558 return count;
559}
560
561/**
562 * ext3_alloc_blocks - multiple allocate blocks needed for a branch
563 * @handle: handle for this transaction
564 * @inode: owner
565 * @goal: preferred place for allocation
566 * @indirect_blks: the number of blocks to allocate for indirect
567 * blocks
568 * @blks: number of blocks to allocate for direct blocks
569 * @new_blocks: on return it will store the new block numbers for
570 * the indirect blocks(if needed) and the first direct block,
571 * @err: here we store the error value
572 *
573 * return the number of direct blocks allocated
574 */
575static int ext3_alloc_blocks(handle_t *handle, struct inode *inode,
576 ext3_fsblk_t goal, int indirect_blks, int blks,
577 ext3_fsblk_t new_blocks[4], int *err)
578{
579 int target, i;
580 unsigned long count = 0;
581 int index = 0;
582 ext3_fsblk_t current_block = 0;
583 int ret = 0;
584
585 /*
586 * Here we try to allocate the requested multiple blocks at once,
587 * on a best-effort basis.
588 * To build a branch, we should allocate blocks for
589 * the indirect blocks (if not allocated yet), and at least
590 * the first direct block of this branch. That's the
591 * minimum number of blocks we need to allocate (required)
592 */
593 target = blks + indirect_blks;
594
595 while (1) {
596 count = target;
597 /* allocating blocks for indirect blocks and direct blocks */
598 current_block = ext3_new_blocks(handle,inode,goal,&count,err);
599 if (*err)
600 goto failed_out;
601
602 target -= count;
603 /* allocate blocks for indirect blocks */
604 while (index < indirect_blks && count) {
605 new_blocks[index++] = current_block++;
606 count--;
607 }
608
609 if (count > 0)
610 break;
611 }
612
613 /* save the new block number for the first direct block */
614 new_blocks[index] = current_block;
615
616 /* total number of blocks allocated for direct blocks */
617 ret = count;
618 *err = 0;
619 return ret;
620failed_out:
621 for (i = 0; i <index; i++)
622 ext3_free_blocks(handle, inode, new_blocks[i], 1);
623 return ret;
624}
625
626/**
627 * ext3_alloc_branch - allocate and set up a chain of blocks.
628 * @handle: handle for this transaction
629 * @inode: owner
630 * @indirect_blks: number of allocated indirect blocks
631 * @blks: number of allocated direct blocks
632 * @goal: preferred place for allocation
633 * @offsets: offsets (in the blocks) to store the pointers to next.
634 * @branch: place to store the chain in.
635 *
636 * This function allocates blocks, zeroes out all but the last one,
637 * links them into chain and (if we are synchronous) writes them to disk.
638 * In other words, it prepares a branch that can be spliced onto the
639 * inode. It stores the information about that chain in the branch[], in
640 * the same format as ext3_get_branch() would do. We are calling it after
641 * we had read the existing part of chain and partial points to the last
642 * triple of that (one with zero ->key). Upon the exit we have the same
643 * picture as after the successful ext3_get_block(), except that in one
644 * place chain is disconnected - *branch->p is still zero (we did not
645 * set the last link), but branch->key contains the number that should
646 * be placed into *branch->p to fill that gap.
647 *
648 * If allocation fails we free all blocks we've allocated (and forget
649 * their buffer_heads) and return the error value from the failed
650 * ext3_alloc_block() (normally -ENOSPC). Otherwise we set the chain
651 * as described above and return 0.
652 */
653static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
654 int indirect_blks, int *blks, ext3_fsblk_t goal,
655 int *offsets, Indirect *branch)
656{
657 int blocksize = inode->i_sb->s_blocksize;
658 int i, n = 0;
659 int err = 0;
660 struct buffer_head *bh;
661 int num;
662 ext3_fsblk_t new_blocks[4];
663 ext3_fsblk_t current_block;
664
665 num = ext3_alloc_blocks(handle, inode, goal, indirect_blks,
666 *blks, new_blocks, &err);
667 if (err)
668 return err;
669
670 branch[0].key = cpu_to_le32(new_blocks[0]);
671 /*
672 * metadata blocks and data blocks are allocated.
673 */
674 for (n = 1; n <= indirect_blks; n++) {
675 /*
676 * Get buffer_head for parent block, zero it out
677 * and set the pointer to new one, then send
678 * parent to disk.
679 */
680 bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
681 if (unlikely(!bh)) {
682 err = -ENOMEM;
683 goto failed;
684 }
685 branch[n].bh = bh;
686 lock_buffer(bh);
687 BUFFER_TRACE(bh, "call get_create_access");
688 err = ext3_journal_get_create_access(handle, bh);
689 if (err) {
690 unlock_buffer(bh);
691 brelse(bh);
692 goto failed;
693 }
694
695 memset(bh->b_data, 0, blocksize);
696 branch[n].p = (__le32 *) bh->b_data + offsets[n];
697 branch[n].key = cpu_to_le32(new_blocks[n]);
698 *branch[n].p = branch[n].key;
699 if ( n == indirect_blks) {
700 current_block = new_blocks[n];
701 /*
702 * End of chain, update the last new metablock of
703 * the chain to point to the newly allocated
704 * data block numbers
705 */
706 for (i=1; i < num; i++)
707 *(branch[n].p + i) = cpu_to_le32(++current_block);
708 }
709 BUFFER_TRACE(bh, "marking uptodate");
710 set_buffer_uptodate(bh);
711 unlock_buffer(bh);
712
713 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
714 err = ext3_journal_dirty_metadata(handle, bh);
715 if (err)
716 goto failed;
717 }
718 *blks = num;
719 return err;
720failed:
721 /* Allocation failed, free what we already allocated */
722 for (i = 1; i <= n ; i++) {
723 BUFFER_TRACE(branch[i].bh, "call journal_forget");
724 ext3_journal_forget(handle, branch[i].bh);
725 }
726 for (i = 0; i < indirect_blks; i++)
727 ext3_free_blocks(handle, inode, new_blocks[i], 1);
728
729 ext3_free_blocks(handle, inode, new_blocks[i], num);
730
731 return err;
732}
733
734/**
735 * ext3_splice_branch - splice the allocated branch onto inode.
736 * @handle: handle for this transaction
737 * @inode: owner
738 * @block: (logical) number of block we are adding
739 * @where: location of missing link
740 * @num: number of indirect blocks we are adding
741 * @blks: number of direct blocks we are adding
742 *
743 * This function fills the missing link and does all housekeeping needed in
744 * inode (->i_blocks, etc.). In case of success we end up with the full
745 * chain to new block and return 0.
746 */
747static int ext3_splice_branch(handle_t *handle, struct inode *inode,
748 long block, Indirect *where, int num, int blks)
749{
750 int i;
751 int err = 0;
752 struct ext3_block_alloc_info *block_i;
753 ext3_fsblk_t current_block;
754 struct ext3_inode_info *ei = EXT3_I(inode);
755 struct timespec now;
756
757 block_i = ei->i_block_alloc_info;
758 /*
759 * If we're splicing into a [td]indirect block (as opposed to the
760 * inode) then we need to get write access to the [td]indirect block
761 * before the splice.
762 */
763 if (where->bh) {
764 BUFFER_TRACE(where->bh, "get_write_access");
765 err = ext3_journal_get_write_access(handle, where->bh);
766 if (err)
767 goto err_out;
768 }
769 /* That's it */
770
771 *where->p = where->key;
772
773 /*
774 * Update the host buffer_head or inode to point to the just-allocated
775 * direct blocks
776 */
777 if (num == 0 && blks > 1) {
778 current_block = le32_to_cpu(where->key) + 1;
779 for (i = 1; i < blks; i++)
780 *(where->p + i ) = cpu_to_le32(current_block++);
781 }
782
783 /*
784 * update the most recently allocated logical & physical block
785 * in i_block_alloc_info, to help find the proper goal block for the next
786 * allocation
787 */
788 if (block_i) {
789 block_i->last_alloc_logical_block = block + blks - 1;
790 block_i->last_alloc_physical_block =
791 le32_to_cpu(where[num].key) + blks - 1;
792 }
793
794 /* We are done with atomic stuff, now do the rest of housekeeping */
795 now = CURRENT_TIME_SEC;
796 if (!timespec_equal(&inode->i_ctime, &now) || !where->bh) {
797 inode->i_ctime = now;
798 ext3_mark_inode_dirty(handle, inode);
799 }
800 /* ext3_mark_inode_dirty already updated i_sync_tid */
801 atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid);
802
803 /* had we spliced it onto an indirect block? */
804 if (where->bh) {
805 /*
806 * If we spliced it onto an indirect block, we haven't
807 * altered the inode. Note however that if it is being spliced
808 * onto an indirect block at the very end of the file (the
809 * file is growing) then we *will* alter the inode to reflect
810 * the new i_size. But that is not done here - it is done in
811 * generic_commit_write->__mark_inode_dirty->ext3_dirty_inode.
812 */
813 jbd_debug(5, "splicing indirect only\n");
814 BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata");
815 err = ext3_journal_dirty_metadata(handle, where->bh);
816 if (err)
817 goto err_out;
818 } else {
819 /*
820 * OK, we spliced it into the inode itself on a direct block.
821 * Inode was dirtied above.
822 */
823 jbd_debug(5, "splicing direct\n");
824 }
825 return err;
826
827err_out:
828 for (i = 1; i <= num; i++) {
829 BUFFER_TRACE(where[i].bh, "call journal_forget");
830 ext3_journal_forget(handle, where[i].bh);
831 ext3_free_blocks(handle,inode,le32_to_cpu(where[i-1].key),1);
832 }
833 ext3_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks);
834
835 return err;
836}
837
838/*
839 * Allocation strategy is simple: if we have to allocate something, we will
840 * have to go the whole way to the leaf. So let's do it before attaching anything
841 * to the tree, set linkage between the newborn blocks, write them if sync is
842 * required, recheck the path, free and repeat if check fails, otherwise
843 * set the last missing link (that will protect us from any truncate-generated
844 * removals - all blocks on the path are immune now) and possibly force the
845 * write on the parent block.
846 * That has a nice additional property: no special recovery from the failed
847 * allocations is needed - we simply release blocks and do not touch anything
848 * reachable from inode.
849 *
850 * `handle' can be NULL if create == 0.
851 *
852 * The BKL may not be held on entry here. Be sure to take it early.
853 * return > 0, # of blocks mapped or allocated.
854 * return = 0, if plain lookup failed.
855 * return < 0, error case.
856 */
857int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
858 sector_t iblock, unsigned long maxblocks,
859 struct buffer_head *bh_result,
860 int create)
861{
862 int err = -EIO;
863 int offsets[4];
864 Indirect chain[4];
865 Indirect *partial;
866 ext3_fsblk_t goal;
867 int indirect_blks;
868 int blocks_to_boundary = 0;
869 int depth;
870 struct ext3_inode_info *ei = EXT3_I(inode);
871 int count = 0;
872 ext3_fsblk_t first_block = 0;
873
874
875 trace_ext3_get_blocks_enter(inode, iblock, maxblocks, create);
876 J_ASSERT(handle != NULL || create == 0);
877 depth = ext3_block_to_path(inode,iblock,offsets,&blocks_to_boundary);
878
879 if (depth == 0)
880 goto out;
881
882 partial = ext3_get_branch(inode, depth, offsets, chain, &err);
883
884 /* Simplest case - block found, no allocation needed */
885 if (!partial) {
886 first_block = le32_to_cpu(chain[depth - 1].key);
887 clear_buffer_new(bh_result);
888 count++;
889 /*map more blocks*/
890 while (count < maxblocks && count <= blocks_to_boundary) {
891 ext3_fsblk_t blk;
892
893 if (!verify_chain(chain, chain + depth - 1)) {
894 /*
895 * Indirect block might be removed by
896 * truncate while we were reading it.
897 * Handling of that case: forget what we've
898 * got now. Flag the err as EAGAIN, so it
899 * will reread.
900 */
901 err = -EAGAIN;
902 count = 0;
903 break;
904 }
905 blk = le32_to_cpu(*(chain[depth-1].p + count));
906
907 if (blk == first_block + count)
908 count++;
909 else
910 break;
911 }
912 if (err != -EAGAIN)
913 goto got_it;
914 }
915
916 /* Next simple case - plain lookup or failed read of indirect block */
917 if (!create || err == -EIO)
918 goto cleanup;
919
920 /*
921 * Block out ext3_truncate while we alter the tree
922 */
923 mutex_lock(&ei->truncate_mutex);
924
925 /*
926 * If the indirect block is missing while we are reading
927 * the chain (ext3_get_branch() returns the -EAGAIN err), or
928 * if the chain has been changed after we grabbed the semaphore
929 * (either because another process truncated this branch, or
930 * another get_block allocated this branch), re-grab the chain to see if
931 * the requested block has been allocated or not.
932 *
933 * Since we already block the truncate/other get_block
934 * at this point, we will have the current copy of the chain when we
935 * splice the branch into the tree.
936 */
937 if (err == -EAGAIN || !verify_chain(chain, partial)) {
938 while (partial > chain) {
939 brelse(partial->bh);
940 partial--;
941 }
942 partial = ext3_get_branch(inode, depth, offsets, chain, &err);
943 if (!partial) {
944 count++;
945 mutex_unlock(&ei->truncate_mutex);
946 if (err)
947 goto cleanup;
948 clear_buffer_new(bh_result);
949 goto got_it;
950 }
951 }
952
953 /*
954 * Okay, we need to do block allocation. Lazily initialize the block
955 * allocation info here if necessary
956 */
957 if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info))
958 ext3_init_block_alloc_info(inode);
959
960 goal = ext3_find_goal(inode, iblock, partial);
961
962 /* the number of blocks need to allocate for [d,t]indirect blocks */
963 indirect_blks = (chain + depth) - partial - 1;
964
965 /*
966 * Next look up the indirect map to count the total number of
967 * direct blocks to allocate for this branch.
968 */
969 count = ext3_blks_to_allocate(partial, indirect_blks,
970 maxblocks, blocks_to_boundary);
971 err = ext3_alloc_branch(handle, inode, indirect_blks, &count, goal,
972 offsets + (partial - chain), partial);
973
974 /*
975 * The ext3_splice_branch call will free and forget any buffers
976 * on the new chain if there is a failure, but that risks using
977 * up transaction credits, especially for bitmaps where the
978 * credits cannot be returned. Can we handle this somehow? We
979 * may need to return -EAGAIN upwards in the worst case. --sct
980 */
981 if (!err)
982 err = ext3_splice_branch(handle, inode, iblock,
983 partial, indirect_blks, count);
984 mutex_unlock(&ei->truncate_mutex);
985 if (err)
986 goto cleanup;
987
988 set_buffer_new(bh_result);
989got_it:
990 map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
991 if (count > blocks_to_boundary)
992 set_buffer_boundary(bh_result);
993 err = count;
994 /* Clean up and exit */
995 partial = chain + depth - 1; /* the whole chain */
996cleanup:
997 while (partial > chain) {
998 BUFFER_TRACE(partial->bh, "call brelse");
999 brelse(partial->bh);
1000 partial--;
1001 }
1002 BUFFER_TRACE(bh_result, "returned");
1003out:
1004 trace_ext3_get_blocks_exit(inode, iblock,
1005 depth ? le32_to_cpu(chain[depth-1].key) : 0,
1006 count, err);
1007 return err;
1008}
1009
1010/* Maximum number of blocks we map for direct IO at once. */
1011#define DIO_MAX_BLOCKS 4096
1012/*
1013 * Number of credits we need for writing DIO_MAX_BLOCKS:
1014 * We need sb + group descriptor + bitmap + inode -> 4
1015 * For B blocks with A block pointers per block we need:
1016 * 1 (triple ind.) + (B/A/A + 2) (doubly ind.) + (B/A + 2) (indirect).
1017 * If we plug in 4096 for B and 256 for A (for 1KB block size), we get 25.
1018 */
1019#define DIO_CREDITS 25
1020
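As a sanity check on the arithmetic in the comment above, the small hedged sketch below plugs B = 4096 blocks and A = 256 pointers per block (1KB block size) into that formula; it is only an illustration of where the value 25 comes from, not part of the ext3 code.

#include <stdio.h>

/* Credits needed to map B data blocks when each indirect block holds A
 * pointers: sb + group descriptor + bitmap + inode (4), one triple-indirect
 * block, B/A/A + 2 double-indirect blocks and B/A + 2 indirect blocks. */
static unsigned long dio_credits(unsigned long b, unsigned long a)
{
	return 4 + 1 + (b / a / a + 2) + (b / a + 2);
}

int main(void)
{
	/* 4096 blocks, 256 pointers per 1KB block -> 4 + 1 + 2 + 18 = 25 */
	printf("DIO_CREDITS = %lu\n", dio_credits(4096, 256));
	return 0;
}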
1021static int ext3_get_block(struct inode *inode, sector_t iblock,
1022 struct buffer_head *bh_result, int create)
1023{
1024 handle_t *handle = ext3_journal_current_handle();
1025 int ret = 0, started = 0;
1026 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
1027
1028 if (create && !handle) { /* Direct IO write... */
1029 if (max_blocks > DIO_MAX_BLOCKS)
1030 max_blocks = DIO_MAX_BLOCKS;
1031 handle = ext3_journal_start(inode, DIO_CREDITS +
1032 EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb));
1033 if (IS_ERR(handle)) {
1034 ret = PTR_ERR(handle);
1035 goto out;
1036 }
1037 started = 1;
1038 }
1039
1040 ret = ext3_get_blocks_handle(handle, inode, iblock,
1041 max_blocks, bh_result, create);
1042 if (ret > 0) {
1043 bh_result->b_size = (ret << inode->i_blkbits);
1044 ret = 0;
1045 }
1046 if (started)
1047 ext3_journal_stop(handle);
1048out:
1049 return ret;
1050}
1051
1052int ext3_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
1053 u64 start, u64 len)
1054{
1055 return generic_block_fiemap(inode, fieinfo, start, len,
1056 ext3_get_block);
1057}
1058
1059/*
1060 * `handle' can be NULL if create is zero
1061 */
1062struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode,
1063 long block, int create, int *errp)
1064{
1065 struct buffer_head dummy;
1066 int fatal = 0, err;
1067
1068 J_ASSERT(handle != NULL || create == 0);
1069
1070 dummy.b_state = 0;
1071 dummy.b_blocknr = -1000;
1072 buffer_trace_init(&dummy.b_history);
1073 err = ext3_get_blocks_handle(handle, inode, block, 1,
1074 &dummy, create);
1075 /*
1076 * ext3_get_blocks_handle() returns number of blocks
1077 * mapped. 0 in case of a HOLE.
1078 */
1079 if (err > 0) {
1080 WARN_ON(err > 1);
1081 err = 0;
1082 }
1083 *errp = err;
1084 if (!err && buffer_mapped(&dummy)) {
1085 struct buffer_head *bh;
1086 bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
1087 if (unlikely(!bh)) {
1088 *errp = -ENOMEM;
1089 goto err;
1090 }
1091 if (buffer_new(&dummy)) {
1092 J_ASSERT(create != 0);
1093 J_ASSERT(handle != NULL);
1094
1095 /*
1096 * Now that we do not always journal data, we should
1097 * keep in mind whether this should always journal the
1098 * new buffer as metadata. For now, regular file
1099 * writes use ext3_get_block instead, so it's not a
1100 * problem.
1101 */
1102 lock_buffer(bh);
1103 BUFFER_TRACE(bh, "call get_create_access");
1104 fatal = ext3_journal_get_create_access(handle, bh);
1105 if (!fatal && !buffer_uptodate(bh)) {
1106 memset(bh->b_data,0,inode->i_sb->s_blocksize);
1107 set_buffer_uptodate(bh);
1108 }
1109 unlock_buffer(bh);
1110 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
1111 err = ext3_journal_dirty_metadata(handle, bh);
1112 if (!fatal)
1113 fatal = err;
1114 } else {
1115 BUFFER_TRACE(bh, "not a new buffer");
1116 }
1117 if (fatal) {
1118 *errp = fatal;
1119 brelse(bh);
1120 bh = NULL;
1121 }
1122 return bh;
1123 }
1124err:
1125 return NULL;
1126}
1127
1128struct buffer_head *ext3_bread(handle_t *handle, struct inode *inode,
1129 int block, int create, int *err)
1130{
1131 struct buffer_head * bh;
1132
1133 bh = ext3_getblk(handle, inode, block, create, err);
1134 if (!bh)
1135 return bh;
1136 if (bh_uptodate_or_lock(bh))
1137 return bh;
1138 get_bh(bh);
1139 bh->b_end_io = end_buffer_read_sync;
1140 submit_bh(READ | REQ_META | REQ_PRIO, bh);
1141 wait_on_buffer(bh);
1142 if (buffer_uptodate(bh))
1143 return bh;
1144 put_bh(bh);
1145 *err = -EIO;
1146 return NULL;
1147}
1148
1149static int walk_page_buffers( handle_t *handle,
1150 struct buffer_head *head,
1151 unsigned from,
1152 unsigned to,
1153 int *partial,
1154 int (*fn)( handle_t *handle,
1155 struct buffer_head *bh))
1156{
1157 struct buffer_head *bh;
1158 unsigned block_start, block_end;
1159 unsigned blocksize = head->b_size;
1160 int err, ret = 0;
1161 struct buffer_head *next;
1162
1163 for ( bh = head, block_start = 0;
1164 ret == 0 && (bh != head || !block_start);
1165 block_start = block_end, bh = next)
1166 {
1167 next = bh->b_this_page;
1168 block_end = block_start + blocksize;
1169 if (block_end <= from || block_start >= to) {
1170 if (partial && !buffer_uptodate(bh))
1171 *partial = 1;
1172 continue;
1173 }
1174 err = (*fn)(handle, bh);
1175 if (!ret)
1176 ret = err;
1177 }
1178 return ret;
1179}
1180
1181/*
1182 * To preserve ordering, it is essential that the hole instantiation and
1183 * the data write be encapsulated in a single transaction. We cannot
1184 * close off a transaction and start a new one between the ext3_get_block()
1185 * and the commit_write(). So doing the journal_start at the start of
1186 * prepare_write() is the right place.
1187 *
1188 * Also, this function can nest inside ext3_writepage() ->
1189 * block_write_full_page(). In that case, we *know* that ext3_writepage()
1190 * has generated enough buffer credits to do the whole page. So we won't
1191 * block on the journal in that case, which is good, because the caller may
1192 * be PF_MEMALLOC.
1193 *
1194 * By accident, ext3 can be reentered when a transaction is open via
1195 * quota file writes. If we were to commit the transaction while thus
1196 * reentered, there can be a deadlock - we would be holding a quota
1197 * lock, and the commit would never complete if another thread had a
1198 * transaction open and was blocking on the quota lock - a ranking
1199 * violation.
1200 *
1201 * So what we do is to rely on the fact that journal_stop/journal_start
1202 * will _not_ run commit under these circumstances because handle->h_ref
1203 * is elevated. We'll still have enough credits for the tiny quotafile
1204 * write.
1205 */
1206static int do_journal_get_write_access(handle_t *handle,
1207 struct buffer_head *bh)
1208{
1209 int dirty = buffer_dirty(bh);
1210 int ret;
1211
1212 if (!buffer_mapped(bh) || buffer_freed(bh))
1213 return 0;
1214 /*
1215 * __block_prepare_write() could have dirtied some buffers. Clean
1216 * the dirty bit as jbd2_journal_get_write_access() could complain
1217 * otherwise about fs integrity issues. Setting of the dirty bit
1218 * by __block_prepare_write() isn't a real problem here as we clear
1219 * the bit before releasing a page lock and thus writeback cannot
1220 * ever write the buffer.
1221 */
1222 if (dirty)
1223 clear_buffer_dirty(bh);
1224 ret = ext3_journal_get_write_access(handle, bh);
1225 if (!ret && dirty)
1226 ret = ext3_journal_dirty_metadata(handle, bh);
1227 return ret;
1228}
1229
1230/*
1231 * Truncate blocks that were not used by write. We have to truncate the
1232 * pagecache as well so that corresponding buffers get properly unmapped.
1233 */
1234static void ext3_truncate_failed_write(struct inode *inode)
1235{
1236 truncate_inode_pages(inode->i_mapping, inode->i_size);
1237 ext3_truncate(inode);
1238}
1239
1240/*
1241 * Truncate blocks that were not used by direct IO write. We have to zero out
1242 * the last file block as well because direct IO might have written to it.
1243 */
1244static void ext3_truncate_failed_direct_write(struct inode *inode)
1245{
1246 ext3_block_truncate_page(inode, inode->i_size);
1247 ext3_truncate(inode);
1248}
1249
1250static int ext3_write_begin(struct file *file, struct address_space *mapping,
1251 loff_t pos, unsigned len, unsigned flags,
1252 struct page **pagep, void **fsdata)
1253{
1254 struct inode *inode = mapping->host;
1255 int ret;
1256 handle_t *handle;
1257 int retries = 0;
1258 struct page *page;
1259 pgoff_t index;
1260 unsigned from, to;
1261 /* Reserve one block more for addition to orphan list in case
1262 * we allocate blocks but write fails for some reason */
1263 int needed_blocks = ext3_writepage_trans_blocks(inode) + 1;
1264
1265 trace_ext3_write_begin(inode, pos, len, flags);
1266
1267 index = pos >> PAGE_CACHE_SHIFT;
1268 from = pos & (PAGE_CACHE_SIZE - 1);
1269 to = from + len;
1270
1271retry:
1272 page = grab_cache_page_write_begin(mapping, index, flags);
1273 if (!page)
1274 return -ENOMEM;
1275 *pagep = page;
1276
1277 handle = ext3_journal_start(inode, needed_blocks);
1278 if (IS_ERR(handle)) {
1279 unlock_page(page);
1280 page_cache_release(page);
1281 ret = PTR_ERR(handle);
1282 goto out;
1283 }
1284 ret = __block_write_begin(page, pos, len, ext3_get_block);
1285 if (ret)
1286 goto write_begin_failed;
1287
1288 if (ext3_should_journal_data(inode)) {
1289 ret = walk_page_buffers(handle, page_buffers(page),
1290 from, to, NULL, do_journal_get_write_access);
1291 }
1292write_begin_failed:
1293 if (ret) {
1294 /*
1295 * block_write_begin may have instantiated a few blocks
1296 * outside i_size. Trim these off again. Don't need
1297 * i_size_read because we hold i_mutex.
1298 *
1299 * Add inode to orphan list in case we crash before truncate
1300 * finishes. Do this only if ext3_can_truncate() agrees so
1301 * that orphan processing code is happy.
1302 */
1303 if (pos + len > inode->i_size && ext3_can_truncate(inode))
1304 ext3_orphan_add(handle, inode);
1305 ext3_journal_stop(handle);
1306 unlock_page(page);
1307 page_cache_release(page);
1308 if (pos + len > inode->i_size)
1309 ext3_truncate_failed_write(inode);
1310 }
1311 if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
1312 goto retry;
1313out:
1314 return ret;
1315}
1316
1317
1318int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
1319{
1320 int err = journal_dirty_data(handle, bh);
1321 if (err)
1322 ext3_journal_abort_handle(__func__, __func__,
1323 bh, handle, err);
1324 return err;
1325}
1326
1327/* For ordered writepage and write_end functions */
1328static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
1329{
1330 /*
1331 * Write could have mapped the buffer but it didn't copy the data in
1332 * yet. So avoid filing such buffer into a transaction.
1333 */
1334 if (buffer_mapped(bh) && buffer_uptodate(bh))
1335 return ext3_journal_dirty_data(handle, bh);
1336 return 0;
1337}
1338
1339/* For write_end() in data=journal mode */
1340static int write_end_fn(handle_t *handle, struct buffer_head *bh)
1341{
1342 if (!buffer_mapped(bh) || buffer_freed(bh))
1343 return 0;
1344 set_buffer_uptodate(bh);
1345 return ext3_journal_dirty_metadata(handle, bh);
1346}
1347
1348/*
1349 * This is nasty and subtle: ext3_write_begin() could have allocated blocks
1350 * for the whole page but later we failed to copy the data in. Update inode
1351 * size according to what we managed to copy. The rest is going to be
1352 * truncated in write_end function.
1353 */
1354static void update_file_sizes(struct inode *inode, loff_t pos, unsigned copied)
1355{
1356 /* What matters to us is i_disksize. We don't write i_size anywhere */
1357 if (pos + copied > inode->i_size)
1358 i_size_write(inode, pos + copied);
1359 if (pos + copied > EXT3_I(inode)->i_disksize) {
1360 EXT3_I(inode)->i_disksize = pos + copied;
1361 mark_inode_dirty(inode);
1362 }
1363}
1364
1365/*
1366 * We need to pick up the new inode size which generic_commit_write gave us
1367 * `file' can be NULL - eg, when called from page_symlink().
1368 *
1369 * ext3 never places buffers on inode->i_mapping->private_list. metadata
1370 * buffers are managed internally.
1371 */
1372static int ext3_ordered_write_end(struct file *file,
1373 struct address_space *mapping,
1374 loff_t pos, unsigned len, unsigned copied,
1375 struct page *page, void *fsdata)
1376{
1377 handle_t *handle = ext3_journal_current_handle();
1378 struct inode *inode = file->f_mapping->host;
1379 unsigned from, to;
1380 int ret = 0, ret2;
1381
1382 trace_ext3_ordered_write_end(inode, pos, len, copied);
1383 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
1384
1385 from = pos & (PAGE_CACHE_SIZE - 1);
1386 to = from + copied;
1387 ret = walk_page_buffers(handle, page_buffers(page),
1388 from, to, NULL, journal_dirty_data_fn);
1389
1390 if (ret == 0)
1391 update_file_sizes(inode, pos, copied);
1392 /*
1393 * There may be allocated blocks outside of i_size because
1394 * we failed to copy some data. Prepare for truncate.
1395 */
1396 if (pos + len > inode->i_size && ext3_can_truncate(inode))
1397 ext3_orphan_add(handle, inode);
1398 ret2 = ext3_journal_stop(handle);
1399 if (!ret)
1400 ret = ret2;
1401 unlock_page(page);
1402 page_cache_release(page);
1403
1404 if (pos + len > inode->i_size)
1405 ext3_truncate_failed_write(inode);
1406 return ret ? ret : copied;
1407}
1408
1409static int ext3_writeback_write_end(struct file *file,
1410 struct address_space *mapping,
1411 loff_t pos, unsigned len, unsigned copied,
1412 struct page *page, void *fsdata)
1413{
1414 handle_t *handle = ext3_journal_current_handle();
1415 struct inode *inode = file->f_mapping->host;
1416 int ret;
1417
1418 trace_ext3_writeback_write_end(inode, pos, len, copied);
1419 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
1420 update_file_sizes(inode, pos, copied);
1421 /*
1422 * There may be allocated blocks outside of i_size because
1423 * we failed to copy some data. Prepare for truncate.
1424 */
1425 if (pos + len > inode->i_size && ext3_can_truncate(inode))
1426 ext3_orphan_add(handle, inode);
1427 ret = ext3_journal_stop(handle);
1428 unlock_page(page);
1429 page_cache_release(page);
1430
1431 if (pos + len > inode->i_size)
1432 ext3_truncate_failed_write(inode);
1433 return ret ? ret : copied;
1434}
1435
1436static int ext3_journalled_write_end(struct file *file,
1437 struct address_space *mapping,
1438 loff_t pos, unsigned len, unsigned copied,
1439 struct page *page, void *fsdata)
1440{
1441 handle_t *handle = ext3_journal_current_handle();
1442 struct inode *inode = mapping->host;
1443 struct ext3_inode_info *ei = EXT3_I(inode);
1444 int ret = 0, ret2;
1445 int partial = 0;
1446 unsigned from, to;
1447
1448 trace_ext3_journalled_write_end(inode, pos, len, copied);
1449 from = pos & (PAGE_CACHE_SIZE - 1);
1450 to = from + len;
1451
1452 if (copied < len) {
1453 if (!PageUptodate(page))
1454 copied = 0;
1455 page_zero_new_buffers(page, from + copied, to);
1456 to = from + copied;
1457 }
1458
1459 ret = walk_page_buffers(handle, page_buffers(page), from,
1460 to, &partial, write_end_fn);
1461 if (!partial)
1462 SetPageUptodate(page);
1463
1464 if (pos + copied > inode->i_size)
1465 i_size_write(inode, pos + copied);
1466 /*
1467 * There may be allocated blocks outside of i_size because
1468 * we failed to copy some data. Prepare for truncate.
1469 */
1470 if (pos + len > inode->i_size && ext3_can_truncate(inode))
1471 ext3_orphan_add(handle, inode);
1472 ext3_set_inode_state(inode, EXT3_STATE_JDATA);
1473 atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid);
1474 if (inode->i_size > ei->i_disksize) {
1475 ei->i_disksize = inode->i_size;
1476 ret2 = ext3_mark_inode_dirty(handle, inode);
1477 if (!ret)
1478 ret = ret2;
1479 }
1480
1481 ret2 = ext3_journal_stop(handle);
1482 if (!ret)
1483 ret = ret2;
1484 unlock_page(page);
1485 page_cache_release(page);
1486
1487 if (pos + len > inode->i_size)
1488 ext3_truncate_failed_write(inode);
1489 return ret ? ret : copied;
1490}
1491
1492/*
1493 * bmap() is special. It gets used by applications such as lilo and by
1494 * the swapper to find the on-disk block of a specific piece of data.
1495 *
1496 * Naturally, this is dangerous if the block concerned is still in the
1497 * journal. If somebody makes a swapfile on an ext3 data-journaling
1498 * filesystem and enables swap, then they may get a nasty shock when the
1499 * data getting swapped to that swapfile suddenly gets overwritten by
1500 * the original zeros written out previously to the journal and
1501 * awaiting writeback in the kernel's buffer cache.
1502 *
1503 * So, if we see any bmap calls here on a modified, data-journaled file,
1504 * take extra steps to flush any blocks which might be in the cache.
1505 */
1506static sector_t ext3_bmap(struct address_space *mapping, sector_t block)
1507{
1508 struct inode *inode = mapping->host;
1509 journal_t *journal;
1510 int err;
1511
1512 if (ext3_test_inode_state(inode, EXT3_STATE_JDATA)) {
1513 /*
1514 * This is a REALLY heavyweight approach, but the use of
1515 * bmap on dirty files is expected to be extremely rare:
1516 * only if we run lilo or swapon on a freshly made file
1517 * do we expect this to happen.
1518 *
1519 * (bmap requires CAP_SYS_RAWIO so this does not
1520 * represent an unprivileged user DOS attack --- we'd be
1521 * in trouble if mortal users could trigger this path at
1522 * will.)
1523 *
1524 * NB. EXT3_STATE_JDATA is not set on files other than
1525 * regular files. If somebody wants to bmap a directory
1526 * or symlink and gets confused because the buffer
1527 * hasn't yet been flushed to disk, they deserve
1528 * everything they get.
1529 */
1530
1531 ext3_clear_inode_state(inode, EXT3_STATE_JDATA);
1532 journal = EXT3_JOURNAL(inode);
1533 journal_lock_updates(journal);
1534 err = journal_flush(journal);
1535 journal_unlock_updates(journal);
1536
1537 if (err)
1538 return 0;
1539 }
1540
1541 return generic_block_bmap(mapping,block,ext3_get_block);
1542}
1543
1544static int bget_one(handle_t *handle, struct buffer_head *bh)
1545{
1546 get_bh(bh);
1547 return 0;
1548}
1549
1550static int bput_one(handle_t *handle, struct buffer_head *bh)
1551{
1552 put_bh(bh);
1553 return 0;
1554}
1555
1556static int buffer_unmapped(handle_t *handle, struct buffer_head *bh)
1557{
1558 return !buffer_mapped(bh);
1559}
1560
1561/*
1562 * Note that whenever we need to map blocks we start a transaction even if
1563 * we're not journalling data. This is to preserve ordering: any hole
1564 * instantiation within __block_write_full_page -> ext3_get_block() should be
1565 * journalled along with the data so we don't crash and then get metadata which
1566 * refers to old data.
1567 *
1568 * In all journalling modes block_write_full_page() will start the I/O.
1569 *
1570 * We don't honour synchronous mounts for writepage(). That would be
1571 * disastrous. Any write() or metadata operation will sync the fs for
1572 * us.
1573 */
1574static int ext3_ordered_writepage(struct page *page,
1575 struct writeback_control *wbc)
1576{
1577 struct inode *inode = page->mapping->host;
1578 struct buffer_head *page_bufs;
1579 handle_t *handle = NULL;
1580 int ret = 0;
1581 int err;
1582
1583 J_ASSERT(PageLocked(page));
1584 /*
1585 * We don't want to warn for emergency remount. The condition is
1586 * ordered to avoid dereferencing inode->i_sb in non-error case to
1587 * avoid slow-downs.
1588 */
1589 WARN_ON_ONCE(IS_RDONLY(inode) &&
1590 !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS));
1591
1592 /*
1593 * We give up here if we're reentered, because it might be for a
1594 * different filesystem.
1595 */
1596 if (ext3_journal_current_handle())
1597 goto out_fail;
1598
1599 trace_ext3_ordered_writepage(page);
1600 if (!page_has_buffers(page)) {
1601 create_empty_buffers(page, inode->i_sb->s_blocksize,
1602 (1 << BH_Dirty)|(1 << BH_Uptodate));
1603 page_bufs = page_buffers(page);
1604 } else {
1605 page_bufs = page_buffers(page);
1606 if (!walk_page_buffers(NULL, page_bufs, 0, PAGE_CACHE_SIZE,
1607 NULL, buffer_unmapped)) {
1608 /* Provide NULL get_block() to catch bugs if buffers
1609 * weren't really mapped */
1610 return block_write_full_page(page, NULL, wbc);
1611 }
1612 }
1613 handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
1614
1615 if (IS_ERR(handle)) {
1616 ret = PTR_ERR(handle);
1617 goto out_fail;
1618 }
1619
1620 walk_page_buffers(handle, page_bufs, 0,
1621 PAGE_CACHE_SIZE, NULL, bget_one);
1622
1623 ret = block_write_full_page(page, ext3_get_block, wbc);
1624
1625 /*
1626 * The page can become unlocked at any point now, and
1627 * truncate can then come in and change things. So we
1628 * can't touch *page from now on. But *page_bufs is
1629 * safe due to elevated refcount.
1630 */
1631
1632 /*
1633 * And attach them to the current transaction. But only if
1634 * block_write_full_page() succeeded. Otherwise they are unmapped,
1635 * and generally junk.
1636 */
1637 if (ret == 0)
1638 ret = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
1639 NULL, journal_dirty_data_fn);
1640 walk_page_buffers(handle, page_bufs, 0,
1641 PAGE_CACHE_SIZE, NULL, bput_one);
1642 err = ext3_journal_stop(handle);
1643 if (!ret)
1644 ret = err;
1645 return ret;
1646
1647out_fail:
1648 redirty_page_for_writepage(wbc, page);
1649 unlock_page(page);
1650 return ret;
1651}
1652
1653static int ext3_writeback_writepage(struct page *page,
1654 struct writeback_control *wbc)
1655{
1656 struct inode *inode = page->mapping->host;
1657 handle_t *handle = NULL;
1658 int ret = 0;
1659 int err;
1660
1661 J_ASSERT(PageLocked(page));
1662 /*
1663 * We don't want to warn for emergency remount. The condition is
1664 * ordered to avoid dereferencing inode->i_sb in non-error case to
1665 * avoid slow-downs.
1666 */
1667 WARN_ON_ONCE(IS_RDONLY(inode) &&
1668 !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS));
1669
1670 if (ext3_journal_current_handle())
1671 goto out_fail;
1672
1673 trace_ext3_writeback_writepage(page);
1674 if (page_has_buffers(page)) {
1675 if (!walk_page_buffers(NULL, page_buffers(page), 0,
1676 PAGE_CACHE_SIZE, NULL, buffer_unmapped)) {
1677 /* Provide NULL get_block() to catch bugs if buffers
1678 * weren't really mapped */
1679 return block_write_full_page(page, NULL, wbc);
1680 }
1681 }
1682
1683 handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
1684 if (IS_ERR(handle)) {
1685 ret = PTR_ERR(handle);
1686 goto out_fail;
1687 }
1688
1689 ret = block_write_full_page(page, ext3_get_block, wbc);
1690
1691 err = ext3_journal_stop(handle);
1692 if (!ret)
1693 ret = err;
1694 return ret;
1695
1696out_fail:
1697 redirty_page_for_writepage(wbc, page);
1698 unlock_page(page);
1699 return ret;
1700}
1701
1702static int ext3_journalled_writepage(struct page *page,
1703 struct writeback_control *wbc)
1704{
1705 struct inode *inode = page->mapping->host;
1706 handle_t *handle = NULL;
1707 int ret = 0;
1708 int err;
1709
1710 J_ASSERT(PageLocked(page));
1711 /*
1712 * We don't want to warn for emergency remount. The condition is
1713 * ordered to avoid dereferencing inode->i_sb in non-error case to
1714 * avoid slow-downs.
1715 */
1716 WARN_ON_ONCE(IS_RDONLY(inode) &&
1717 !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS));
1718
1719 trace_ext3_journalled_writepage(page);
1720 if (!page_has_buffers(page) || PageChecked(page)) {
1721 if (ext3_journal_current_handle())
1722 goto no_write;
1723
1724 handle = ext3_journal_start(inode,
1725 ext3_writepage_trans_blocks(inode));
1726 if (IS_ERR(handle)) {
1727 ret = PTR_ERR(handle);
1728 goto no_write;
1729 }
1730 /*
1731 * It's mmapped pagecache. Add buffers and journal it. There
1732 * doesn't seem much point in redirtying the page here.
1733 */
1734 ClearPageChecked(page);
1735 ret = __block_write_begin(page, 0, PAGE_CACHE_SIZE,
1736 ext3_get_block);
1737 if (ret != 0) {
1738 ext3_journal_stop(handle);
1739 goto out_unlock;
1740 }
1741 ret = walk_page_buffers(handle, page_buffers(page), 0,
1742 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
1743
1744 err = walk_page_buffers(handle, page_buffers(page), 0,
1745 PAGE_CACHE_SIZE, NULL, write_end_fn);
1746 if (ret == 0)
1747 ret = err;
1748 ext3_set_inode_state(inode, EXT3_STATE_JDATA);
1749 atomic_set(&EXT3_I(inode)->i_datasync_tid,
1750 handle->h_transaction->t_tid);
1751 unlock_page(page);
1752 err = ext3_journal_stop(handle);
1753 if (!ret)
1754 ret = err;
1755 } else {
1756 /*
1757 * It is a page full of checkpoint-mode buffers. Go and write
1758 * them. They should have been already mapped when they went
1759 * to the journal so provide NULL get_block function to catch
1760 * errors.
1761 */
1762 ret = block_write_full_page(page, NULL, wbc);
1763 }
1764out:
1765 return ret;
1766
1767no_write:
1768 redirty_page_for_writepage(wbc, page);
1769out_unlock:
1770 unlock_page(page);
1771 goto out;
1772}
1773
1774static int ext3_readpage(struct file *file, struct page *page)
1775{
1776 trace_ext3_readpage(page);
1777 return mpage_readpage(page, ext3_get_block);
1778}
1779
1780static int
1781ext3_readpages(struct file *file, struct address_space *mapping,
1782 struct list_head *pages, unsigned nr_pages)
1783{
1784 return mpage_readpages(mapping, pages, nr_pages, ext3_get_block);
1785}
1786
1787static void ext3_invalidatepage(struct page *page, unsigned int offset,
1788 unsigned int length)
1789{
1790 journal_t *journal = EXT3_JOURNAL(page->mapping->host);
1791
1792 trace_ext3_invalidatepage(page, offset, length);
1793
1794 /*
1795 * If it's a full truncate we just forget about the pending dirtying
1796 */
1797 if (offset == 0 && length == PAGE_CACHE_SIZE)
1798 ClearPageChecked(page);
1799
1800 journal_invalidatepage(journal, page, offset, length);
1801}
1802
1803static int ext3_releasepage(struct page *page, gfp_t wait)
1804{
1805 journal_t *journal = EXT3_JOURNAL(page->mapping->host);
1806
1807 trace_ext3_releasepage(page);
1808 WARN_ON(PageChecked(page));
1809 if (!page_has_buffers(page))
1810 return 0;
1811 return journal_try_to_free_buffers(journal, page, wait);
1812}
1813
1814/*
1815 * If the O_DIRECT write will extend the file then add this inode to the
1816 * orphan list. So recovery will truncate it back to the original size
1817 * if the machine crashes during the write.
1818 *
1819	 * If the O_DIRECT write is instantiating holes inside i_size and the machine
1820 * crashes then stale disk data _may_ be exposed inside the file. But current
1821 * VFS code falls back into buffered path in that case so we are safe.
1822 */
1823static ssize_t ext3_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
1824 loff_t offset)
1825{
1826 struct file *file = iocb->ki_filp;
1827 struct inode *inode = file->f_mapping->host;
1828 struct ext3_inode_info *ei = EXT3_I(inode);
1829 handle_t *handle;
1830 ssize_t ret;
1831 int orphan = 0;
1832 size_t count = iov_iter_count(iter);
1833 int retries = 0;
1834
1835 trace_ext3_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
1836
1837 if (iov_iter_rw(iter) == WRITE) {
1838 loff_t final_size = offset + count;
1839
1840 if (final_size > inode->i_size) {
1841 /* Credits for sb + inode write */
1842 handle = ext3_journal_start(inode, 2);
1843 if (IS_ERR(handle)) {
1844 ret = PTR_ERR(handle);
1845 goto out;
1846 }
1847 ret = ext3_orphan_add(handle, inode);
1848 if (ret) {
1849 ext3_journal_stop(handle);
1850 goto out;
1851 }
1852 orphan = 1;
1853 ei->i_disksize = inode->i_size;
1854 ext3_journal_stop(handle);
1855 }
1856 }
1857
1858retry:
1859 ret = blockdev_direct_IO(iocb, inode, iter, offset, ext3_get_block);
1860 /*
1861 * In case of error extending write may have instantiated a few
1862 * blocks outside i_size. Trim these off again.
1863 */
1864 if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
1865 loff_t isize = i_size_read(inode);
1866 loff_t end = offset + count;
1867
1868 if (end > isize)
1869 ext3_truncate_failed_direct_write(inode);
1870 }
1871 if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
1872 goto retry;
1873
1874 if (orphan) {
1875 int err;
1876
1877 /* Credits for sb + inode write */
1878 handle = ext3_journal_start(inode, 2);
1879 if (IS_ERR(handle)) {
1880 /* This is really bad luck. We've written the data
1881 * but cannot extend i_size. Truncate allocated blocks
1882 * and pretend the write failed... */
1883 ext3_truncate_failed_direct_write(inode);
1884 ret = PTR_ERR(handle);
1885 if (inode->i_nlink)
1886 ext3_orphan_del(NULL, inode);
1887 goto out;
1888 }
1889 if (inode->i_nlink)
1890 ext3_orphan_del(handle, inode);
1891 if (ret > 0) {
1892 loff_t end = offset + ret;
1893 if (end > inode->i_size) {
1894 ei->i_disksize = end;
1895 i_size_write(inode, end);
1896 /*
1897 * We're going to return a positive `ret'
1898 * here due to non-zero-length I/O, so there's
1899 * no way of reporting error returns from
1900 * ext3_mark_inode_dirty() to userspace. So
1901 * ignore it.
1902 */
1903 ext3_mark_inode_dirty(handle, inode);
1904 }
1905 }
1906 err = ext3_journal_stop(handle);
1907 if (ret == 0)
1908 ret = err;
1909 }
1910out:
1911 trace_ext3_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret);
1912 return ret;
1913}
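/*
 * Illustrative sequence (hypothetical sizes, not from the original
 * source): an O_DIRECT write at offset == i_size first takes a small
 * handle, puts the inode on the orphan list and leaves i_disksize at
 * the old size. If the machine crashes while blockdev_direct_IO() is
 * instantiating the new blocks, orphan recovery truncates back to that
 * old size, so nothing beyond the old EOF becomes visible. On success
 * the code above removes the orphan entry and advances i_size and
 * i_disksize together.
 */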
1914
1915/*
1916 * Pages can be marked dirty completely asynchronously from ext3's journalling
1917 * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do
1918 * much here because ->set_page_dirty is called under VFS locks. The page is
1919 * not necessarily locked.
1920 *
1921 * We cannot just dirty the page and leave attached buffers clean, because the
1922 * buffers' dirty state is "definitive". We cannot just set the buffers dirty
1923 * or jbddirty because all the journalling code will explode.
1924 *
1925 * So what we do is to mark the page "pending dirty" and next time writepage
1926 * is called, propagate that into the buffers appropriately.
1927 */
1928static int ext3_journalled_set_page_dirty(struct page *page)
1929{
1930 SetPageChecked(page);
1931 return __set_page_dirty_nobuffers(page);
1932}
1933
1934static const struct address_space_operations ext3_ordered_aops = {
1935 .readpage = ext3_readpage,
1936 .readpages = ext3_readpages,
1937 .writepage = ext3_ordered_writepage,
1938 .write_begin = ext3_write_begin,
1939 .write_end = ext3_ordered_write_end,
1940 .bmap = ext3_bmap,
1941 .invalidatepage = ext3_invalidatepage,
1942 .releasepage = ext3_releasepage,
1943 .direct_IO = ext3_direct_IO,
1944 .migratepage = buffer_migrate_page,
1945 .is_partially_uptodate = block_is_partially_uptodate,
1946 .is_dirty_writeback = buffer_check_dirty_writeback,
1947 .error_remove_page = generic_error_remove_page,
1948};
1949
1950static const struct address_space_operations ext3_writeback_aops = {
1951 .readpage = ext3_readpage,
1952 .readpages = ext3_readpages,
1953 .writepage = ext3_writeback_writepage,
1954 .write_begin = ext3_write_begin,
1955 .write_end = ext3_writeback_write_end,
1956 .bmap = ext3_bmap,
1957 .invalidatepage = ext3_invalidatepage,
1958 .releasepage = ext3_releasepage,
1959 .direct_IO = ext3_direct_IO,
1960 .migratepage = buffer_migrate_page,
1961 .is_partially_uptodate = block_is_partially_uptodate,
1962 .error_remove_page = generic_error_remove_page,
1963};
1964
1965static const struct address_space_operations ext3_journalled_aops = {
1966 .readpage = ext3_readpage,
1967 .readpages = ext3_readpages,
1968 .writepage = ext3_journalled_writepage,
1969 .write_begin = ext3_write_begin,
1970 .write_end = ext3_journalled_write_end,
1971 .set_page_dirty = ext3_journalled_set_page_dirty,
1972 .bmap = ext3_bmap,
1973 .invalidatepage = ext3_invalidatepage,
1974 .releasepage = ext3_releasepage,
1975 .is_partially_uptodate = block_is_partially_uptodate,
1976 .error_remove_page = generic_error_remove_page,
1977};
1978
1979void ext3_set_aops(struct inode *inode)
1980{
1981 if (ext3_should_order_data(inode))
1982 inode->i_mapping->a_ops = &ext3_ordered_aops;
1983 else if (ext3_should_writeback_data(inode))
1984 inode->i_mapping->a_ops = &ext3_writeback_aops;
1985 else
1986 inode->i_mapping->a_ops = &ext3_journalled_aops;
1987}
1988
1989/*
1990 * ext3_block_truncate_page() zeroes out a mapping from file offset `from'
1991 * up to the end of the block which corresponds to `from'.
1992	 * This is required during truncate. We need to physically zero the tail end
1993 * of that block so it doesn't yield old data if the file is later grown.
1994 */
1995static int ext3_block_truncate_page(struct inode *inode, loff_t from)
1996{
1997 ext3_fsblk_t index = from >> PAGE_CACHE_SHIFT;
1998 unsigned offset = from & (PAGE_CACHE_SIZE - 1);
1999 unsigned blocksize, iblock, length, pos;
2000 struct page *page;
2001 handle_t *handle = NULL;
2002 struct buffer_head *bh;
2003 int err = 0;
2004
2005 /* Truncated on block boundary - nothing to do */
2006 blocksize = inode->i_sb->s_blocksize;
2007 if ((from & (blocksize - 1)) == 0)
2008 return 0;
2009
2010 page = grab_cache_page(inode->i_mapping, index);
2011 if (!page)
2012 return -ENOMEM;
2013 length = blocksize - (offset & (blocksize - 1));
2014 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
2015
2016 if (!page_has_buffers(page))
2017 create_empty_buffers(page, blocksize, 0);
2018
2019 /* Find the buffer that contains "offset" */
2020 bh = page_buffers(page);
2021 pos = blocksize;
2022 while (offset >= pos) {
2023 bh = bh->b_this_page;
2024 iblock++;
2025 pos += blocksize;
2026 }
2027
2028 err = 0;
2029 if (buffer_freed(bh)) {
2030 BUFFER_TRACE(bh, "freed: skip");
2031 goto unlock;
2032 }
2033
2034 if (!buffer_mapped(bh)) {
2035 BUFFER_TRACE(bh, "unmapped");
2036 ext3_get_block(inode, iblock, bh, 0);
2037 /* unmapped? It's a hole - nothing to do */
2038 if (!buffer_mapped(bh)) {
2039 BUFFER_TRACE(bh, "still unmapped");
2040 goto unlock;
2041 }
2042 }
2043
2044 /* Ok, it's mapped. Make sure it's up-to-date */
2045 if (PageUptodate(page))
2046 set_buffer_uptodate(bh);
2047
2048 if (!bh_uptodate_or_lock(bh)) {
2049 err = bh_submit_read(bh);
2050 /* Uhhuh. Read error. Complain and punt. */
2051 if (err)
2052 goto unlock;
2053 }
2054
2055 /* data=writeback mode doesn't need transaction to zero-out data */
2056 if (!ext3_should_writeback_data(inode)) {
2057 /* We journal at most one block */
2058 handle = ext3_journal_start(inode, 1);
2059 if (IS_ERR(handle)) {
2060 clear_highpage(page);
2061 flush_dcache_page(page);
2062 err = PTR_ERR(handle);
2063 goto unlock;
2064 }
2065 }
2066
2067 if (ext3_should_journal_data(inode)) {
2068 BUFFER_TRACE(bh, "get write access");
2069 err = ext3_journal_get_write_access(handle, bh);
2070 if (err)
2071 goto stop;
2072 }
2073
2074 zero_user(page, offset, length);
2075 BUFFER_TRACE(bh, "zeroed end of block");
2076
2077 err = 0;
2078 if (ext3_should_journal_data(inode)) {
2079 err = ext3_journal_dirty_metadata(handle, bh);
2080 } else {
2081 if (ext3_should_order_data(inode))
2082 err = ext3_journal_dirty_data(handle, bh);
2083 mark_buffer_dirty(bh);
2084 }
2085stop:
2086 if (handle)
2087 ext3_journal_stop(handle);
2088
2089unlock:
2090 unlock_page(page);
2091 page_cache_release(page);
2092 return err;
2093}
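/*
 * Worked example (illustrative, assuming a 4 KB page and a 1 KB block
 * size): truncating at from = 3000 gives index = 3000 >> 12 = 0,
 * offset = 3000 and length = 1024 - (3000 & 1023) = 72, so bytes
 * 3000..3071 - the tail of the third 1 KB block in page 0 - are
 * zeroed and the buffer is journalled or dirtied according to the
 * data journaling mode.
 */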
2094
2095/*
2096 * Probably it should be a library function... search for first non-zero word
2097 * or memcmp with zero_page, whatever is better for particular architecture.
2098 * Linus?
2099 */
2100static inline int all_zeroes(__le32 *p, __le32 *q)
2101{
2102 while (p < q)
2103 if (*p++)
2104 return 0;
2105 return 1;
2106}
2107
2108/**
2109 * ext3_find_shared - find the indirect blocks for partial truncation.
2110 * @inode: inode in question
2111 * @depth: depth of the affected branch
2112 * @offsets: offsets of pointers in that branch (see ext3_block_to_path)
2113 * @chain: place to store the pointers to partial indirect blocks
2114 * @top: place to the (detached) top of branch
2115 *
2116 * This is a helper function used by ext3_truncate().
2117 *
2118 * When we do truncate() we may have to clean the ends of several
2119 * indirect blocks but leave the blocks themselves alive. Block is
2120 * partially truncated if some data below the new i_size is referred
2121 * from it (and it is on the path to the first completely truncated
2122 * data block, indeed). We have to free the top of that path along
2123 * with everything to the right of the path. Since no allocation
2124 * past the truncation point is possible until ext3_truncate()
2125 * finishes, we may safely do the latter, but top of branch may
2126 * require special attention - pageout below the truncation point
2127 * might try to populate it.
2128 *
2129 * We atomically detach the top of branch from the tree, store the
2130 * block number of its root in *@top, pointers to buffer_heads of
2131 * partially truncated blocks - in @chain[].bh and pointers to
2132 * their last elements that should not be removed - in
2133 * @chain[].p. Return value is the pointer to last filled element
2134 * of @chain.
2135 *
2136	 * The work left to the caller is the actual freeing of subtrees:
2137 * a) free the subtree starting from *@top
2138 * b) free the subtrees whose roots are stored in
2139 * (@chain[i].p+1 .. end of @chain[i].bh->b_data)
2140 * c) free the subtrees growing from the inode past the @chain[0].
2141 * (no partially truncated stuff there). */
2142
2143static Indirect *ext3_find_shared(struct inode *inode, int depth,
2144 int offsets[4], Indirect chain[4], __le32 *top)
2145{
2146 Indirect *partial, *p;
2147 int k, err;
2148
2149 *top = 0;
2150 /* Make k index the deepest non-null offset + 1 */
2151 for (k = depth; k > 1 && !offsets[k-1]; k--)
2152 ;
2153 partial = ext3_get_branch(inode, k, offsets, chain, &err);
2154 /* Writer: pointers */
2155 if (!partial)
2156 partial = chain + k-1;
2157 /*
2158	 * If the branch has acquired a continuation since we last looked at it -
2159 * fine, it should all survive and (new) top doesn't belong to us.
2160 */
2161 if (!partial->key && *partial->p)
2162 /* Writer: end */
2163 goto no_top;
2164 for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--)
2165 ;
2166 /*
2167 * OK, we've found the last block that must survive. The rest of our
2168 * branch should be detached before unlocking. However, if that rest
2169 * of branch is all ours and does not grow immediately from the inode
2170 * it's easier to cheat and just decrement partial->p.
2171 */
2172 if (p == chain + k - 1 && p > chain) {
2173 p->p--;
2174 } else {
2175 *top = *p->p;
2176 /* Nope, don't do this in ext3. Must leave the tree intact */
2177#if 0
2178 *p->p = 0;
2179#endif
2180 }
2181 /* Writer: end */
2182
2183 while(partial > p) {
2184 brelse(partial->bh);
2185 partial--;
2186 }
2187no_top:
2188 return partial;
2189}
2190
2191/*
2192 * Zero a number of block pointers in either an inode or an indirect block.
2193 * If we restart the transaction we must again get write access to the
2194 * indirect block for further modification.
2195 *
2196 * We release `count' blocks on disk, but (last - first) may be greater
2197 * than `count' because there can be holes in there.
2198 */
2199static void ext3_clear_blocks(handle_t *handle, struct inode *inode,
2200 struct buffer_head *bh, ext3_fsblk_t block_to_free,
2201 unsigned long count, __le32 *first, __le32 *last)
2202{
2203 __le32 *p;
2204 if (try_to_extend_transaction(handle, inode)) {
2205 if (bh) {
2206 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
2207 if (ext3_journal_dirty_metadata(handle, bh))
2208 return;
2209 }
2210 ext3_mark_inode_dirty(handle, inode);
2211 truncate_restart_transaction(handle, inode);
2212 if (bh) {
2213 BUFFER_TRACE(bh, "retaking write access");
2214 if (ext3_journal_get_write_access(handle, bh))
2215 return;
2216 }
2217 }
2218
2219 /*
2220 * Any buffers which are on the journal will be in memory. We find
2221 * them on the hash table so journal_revoke() will run journal_forget()
2222 * on them. We've already detached each block from the file, so
2223 * bforget() in journal_forget() should be safe.
2224 *
2225 * AKPM: turn on bforget in journal_forget()!!!
2226 */
2227 for (p = first; p < last; p++) {
2228 u32 nr = le32_to_cpu(*p);
2229 if (nr) {
2230 struct buffer_head *bh;
2231
2232 *p = 0;
2233 bh = sb_find_get_block(inode->i_sb, nr);
2234 ext3_forget(handle, 0, inode, bh, nr);
2235 }
2236 }
2237
2238 ext3_free_blocks(handle, inode, block_to_free, count);
2239}
2240
2241/**
2242 * ext3_free_data - free a list of data blocks
2243 * @handle: handle for this transaction
2244 * @inode: inode we are dealing with
2245 * @this_bh: indirect buffer_head which contains *@first and *@last
2246 * @first: array of block numbers
2247 * @last: points immediately past the end of array
2248 *
2249 * We are freeing all blocks referred from that array (numbers are stored as
2250 * little-endian 32-bit) and updating @inode->i_blocks appropriately.
2251 *
2252 * We accumulate contiguous runs of blocks to free. Conveniently, if these
2253 * blocks are contiguous then releasing them at one time will only affect one
2254 * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
2255 * actually use a lot of journal space.
2256 *
2257 * @this_bh will be %NULL if @first and @last point into the inode's direct
2258 * block pointers.
2259 */
2260static void ext3_free_data(handle_t *handle, struct inode *inode,
2261 struct buffer_head *this_bh,
2262 __le32 *first, __le32 *last)
2263{
2264 ext3_fsblk_t block_to_free = 0; /* Starting block # of a run */
2265 unsigned long count = 0; /* Number of blocks in the run */
2266 __le32 *block_to_free_p = NULL; /* Pointer into inode/ind
2267 corresponding to
2268 block_to_free */
2269 ext3_fsblk_t nr; /* Current block # */
2270 __le32 *p; /* Pointer into inode/ind
2271 for current block */
2272 int err;
2273
2274 if (this_bh) { /* For indirect block */
2275 BUFFER_TRACE(this_bh, "get_write_access");
2276 err = ext3_journal_get_write_access(handle, this_bh);
2277 /* Important: if we can't update the indirect pointers
2278 * to the blocks, we can't free them. */
2279 if (err)
2280 return;
2281 }
2282
2283 for (p = first; p < last; p++) {
2284 nr = le32_to_cpu(*p);
2285 if (nr) {
2286 /* accumulate blocks to free if they're contiguous */
2287 if (count == 0) {
2288 block_to_free = nr;
2289 block_to_free_p = p;
2290 count = 1;
2291 } else if (nr == block_to_free + count) {
2292 count++;
2293 } else {
2294 ext3_clear_blocks(handle, inode, this_bh,
2295 block_to_free,
2296 count, block_to_free_p, p);
2297 block_to_free = nr;
2298 block_to_free_p = p;
2299 count = 1;
2300 }
2301 }
2302 }
2303
2304 if (count > 0)
2305 ext3_clear_blocks(handle, inode, this_bh, block_to_free,
2306 count, block_to_free_p, p);
2307
2308 if (this_bh) {
2309 BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata");
2310
2311 /*
2312 * The buffer head should have an attached journal head at this
2313 * point. However, if the data is corrupted and an indirect
2314 * block pointed to itself, it would have been detached when
2315 * the block was cleared. Check for this instead of OOPSing.
2316 */
2317 if (bh2jh(this_bh))
2318 ext3_journal_dirty_metadata(handle, this_bh);
2319 else
2320 ext3_error(inode->i_sb, "ext3_free_data",
2321 "circular indirect block detected, "
2322 "inode=%lu, block=%llu",
2323 inode->i_ino,
2324 (unsigned long long)this_bh->b_blocknr);
2325 }
2326}
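/*
 * Illustrative run accumulation (made-up block numbers): if the array
 * passed in is {100, 101, 102, 0, 200, 201}, the loop batches the
 * contiguous blocks and issues ext3_clear_blocks(..., 100, 3, ...) when
 * it reaches 200 (the zero entry is a hole and is skipped), and the
 * trailing run is flushed after the loop as
 * ext3_clear_blocks(..., 200, 2, ...).
 */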
2327
2328/**
2329 * ext3_free_branches - free an array of branches
2330 * @handle: JBD handle for this transaction
2331 * @inode: inode we are dealing with
2332 * @parent_bh: the buffer_head which contains *@first and *@last
2333 * @first: array of block numbers
2334 * @last: pointer immediately past the end of array
2335 * @depth: depth of the branches to free
2336 *
2337 * We are freeing all blocks referred from these branches (numbers are
2338 * stored as little-endian 32-bit) and updating @inode->i_blocks
2339 * appropriately.
2340 */
2341static void ext3_free_branches(handle_t *handle, struct inode *inode,
2342 struct buffer_head *parent_bh,
2343 __le32 *first, __le32 *last, int depth)
2344{
2345 ext3_fsblk_t nr;
2346 __le32 *p;
2347
2348 if (is_handle_aborted(handle))
2349 return;
2350
2351 if (depth--) {
2352 struct buffer_head *bh;
2353 int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
2354 p = last;
2355 while (--p >= first) {
2356 nr = le32_to_cpu(*p);
2357 if (!nr)
2358 continue; /* A hole */
2359
2360 /* Go read the buffer for the next level down */
2361 bh = sb_bread(inode->i_sb, nr);
2362
2363 /*
2364 * A read failure? Report error and clear slot
2365 * (should be rare).
2366 */
2367 if (!bh) {
2368 ext3_error(inode->i_sb, "ext3_free_branches",
2369 "Read failure, inode=%lu, block="E3FSBLK,
2370 inode->i_ino, nr);
2371 continue;
2372 }
2373
2374 /* This zaps the entire block. Bottom up. */
2375 BUFFER_TRACE(bh, "free child branches");
2376 ext3_free_branches(handle, inode, bh,
2377 (__le32*)bh->b_data,
2378 (__le32*)bh->b_data + addr_per_block,
2379 depth);
2380
2381 /*
2382			 * Everything below this pointer has been
2383 * released. Now let this top-of-subtree go.
2384 *
2385 * We want the freeing of this indirect block to be
2386 * atomic in the journal with the updating of the
2387 * bitmap block which owns it. So make some room in
2388 * the journal.
2389 *
2390 * We zero the parent pointer *after* freeing its
2391 * pointee in the bitmaps, so if extend_transaction()
2392 * for some reason fails to put the bitmap changes and
2393 * the release into the same transaction, recovery
2394 * will merely complain about releasing a free block,
2395 * rather than leaking blocks.
2396 */
2397 if (is_handle_aborted(handle))
2398 return;
2399 if (try_to_extend_transaction(handle, inode)) {
2400 ext3_mark_inode_dirty(handle, inode);
2401 truncate_restart_transaction(handle, inode);
2402 }
2403
2404 /*
2405 * We've probably journalled the indirect block several
2406 * times during the truncate. But it's no longer
2407 * needed and we now drop it from the transaction via
2408 * journal_revoke().
2409 *
2410 * That's easy if it's exclusively part of this
2411 * transaction. But if it's part of the committing
2412 * transaction then journal_forget() will simply
2413 * brelse() it. That means that if the underlying
2414 * block is reallocated in ext3_get_block(),
2415 * unmap_underlying_metadata() will find this block
2416 * and will try to get rid of it. damn, damn. Thus
2417 * we don't allow a block to be reallocated until
2418 * a transaction freeing it has fully committed.
2419 *
2420 * We also have to make sure journal replay after a
2421 * crash does not overwrite non-journaled data blocks
2422 * with old metadata when the block got reallocated for
2423 * data. Thus we have to store a revoke record for a
2424 * block in the same transaction in which we free the
2425 * block.
2426 */
2427 ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
2428
2429 ext3_free_blocks(handle, inode, nr, 1);
2430
2431 if (parent_bh) {
2432 /*
2433 * The block which we have just freed is
2434 * pointed to by an indirect block: journal it
2435 */
2436 BUFFER_TRACE(parent_bh, "get_write_access");
2437 if (!ext3_journal_get_write_access(handle,
2438 parent_bh)){
2439 *p = 0;
2440 BUFFER_TRACE(parent_bh,
2441 "call ext3_journal_dirty_metadata");
2442 ext3_journal_dirty_metadata(handle,
2443 parent_bh);
2444 }
2445 }
2446 }
2447 } else {
2448 /* We have reached the bottom of the tree. */
2449 BUFFER_TRACE(parent_bh, "free data blocks");
2450 ext3_free_data(handle, inode, parent_bh, first, last);
2451 }
2452}
2453
2454int ext3_can_truncate(struct inode *inode)
2455{
2456 if (S_ISREG(inode->i_mode))
2457 return 1;
2458 if (S_ISDIR(inode->i_mode))
2459 return 1;
2460 if (S_ISLNK(inode->i_mode))
2461 return !ext3_inode_is_fast_symlink(inode);
2462 return 0;
2463}
2464
2465/*
2466 * ext3_truncate()
2467 *
2468 * We block out ext3_get_block() block instantiations across the entire
2469 * transaction, and VFS/VM ensures that ext3_truncate() cannot run
2470 * simultaneously on behalf of the same inode.
2471 *
2472 * As we work through the truncate and commit bits of it to the journal there
2473 * is one core, guiding principle: the file's tree must always be consistent on
2474 * disk. We must be able to restart the truncate after a crash.
2475 *
2476 * The file's tree may be transiently inconsistent in memory (although it
2477 * probably isn't), but whenever we close off and commit a journal transaction,
2478 * the contents of (the filesystem + the journal) must be consistent and
2479 * restartable. It's pretty simple, really: bottom up, right to left (although
2480 * left-to-right works OK too).
2481 *
2482 * Note that at recovery time, journal replay occurs *before* the restart of
2483 * truncate against the orphan inode list.
2484 *
2485 * The committed inode has the new, desired i_size (which is the same as
2486 * i_disksize in this case). After a crash, ext3_orphan_cleanup() will see
2487 * that this inode's truncate did not complete and it will again call
2488 * ext3_truncate() to have another go. So there will be instantiated blocks
2489 * to the right of the truncation point in a crashed ext3 filesystem. But
2490 * that's fine - as long as they are linked from the inode, the post-crash
2491 * ext3_truncate() run will find them and release them.
2492 */
2493void ext3_truncate(struct inode *inode)
2494{
2495 handle_t *handle;
2496 struct ext3_inode_info *ei = EXT3_I(inode);
2497 __le32 *i_data = ei->i_data;
2498 int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
2499 int offsets[4];
2500 Indirect chain[4];
2501 Indirect *partial;
2502 __le32 nr = 0;
2503 int n;
2504 long last_block;
2505 unsigned blocksize = inode->i_sb->s_blocksize;
2506
2507 trace_ext3_truncate_enter(inode);
2508
2509 if (!ext3_can_truncate(inode))
2510 goto out_notrans;
2511
2512 if (inode->i_size == 0 && ext3_should_writeback_data(inode))
2513 ext3_set_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE);
2514
2515 handle = start_transaction(inode);
2516 if (IS_ERR(handle))
2517 goto out_notrans;
2518
2519 last_block = (inode->i_size + blocksize-1)
2520 >> EXT3_BLOCK_SIZE_BITS(inode->i_sb);
2521 n = ext3_block_to_path(inode, last_block, offsets, NULL);
2522 if (n == 0)
2523 goto out_stop; /* error */
2524
2525 /*
2526 * OK. This truncate is going to happen. We add the inode to the
2527 * orphan list, so that if this truncate spans multiple transactions,
2528 * and we crash, we will resume the truncate when the filesystem
2529 * recovers. It also marks the inode dirty, to catch the new size.
2530 *
2531 * Implication: the file must always be in a sane, consistent
2532 * truncatable state while each transaction commits.
2533 */
2534 if (ext3_orphan_add(handle, inode))
2535 goto out_stop;
2536
2537 /*
2538 * The orphan list entry will now protect us from any crash which
2539 * occurs before the truncate completes, so it is now safe to propagate
2540 * the new, shorter inode size (held for now in i_size) into the
2541 * on-disk inode. We do this via i_disksize, which is the value which
2542 * ext3 *really* writes onto the disk inode.
2543 */
2544 ei->i_disksize = inode->i_size;
2545
2546 /*
2547 * From here we block out all ext3_get_block() callers who want to
2548 * modify the block allocation tree.
2549 */
2550 mutex_lock(&ei->truncate_mutex);
2551
2552 if (n == 1) { /* direct blocks */
2553 ext3_free_data(handle, inode, NULL, i_data+offsets[0],
2554 i_data + EXT3_NDIR_BLOCKS);
2555 goto do_indirects;
2556 }
2557
2558 partial = ext3_find_shared(inode, n, offsets, chain, &nr);
2559 /* Kill the top of shared branch (not detached) */
2560 if (nr) {
2561 if (partial == chain) {
2562 /* Shared branch grows from the inode */
2563 ext3_free_branches(handle, inode, NULL,
2564 &nr, &nr+1, (chain+n-1) - partial);
2565 *partial->p = 0;
2566 /*
2567 * We mark the inode dirty prior to restart,
2568 * and prior to stop. No need for it here.
2569 */
2570 } else {
2571 /* Shared branch grows from an indirect block */
2572 ext3_free_branches(handle, inode, partial->bh,
2573 partial->p,
2574 partial->p+1, (chain+n-1) - partial);
2575 }
2576 }
2577 /* Clear the ends of indirect blocks on the shared branch */
2578 while (partial > chain) {
2579 ext3_free_branches(handle, inode, partial->bh, partial->p + 1,
2580 (__le32*)partial->bh->b_data+addr_per_block,
2581 (chain+n-1) - partial);
2582 BUFFER_TRACE(partial->bh, "call brelse");
2583 brelse (partial->bh);
2584 partial--;
2585 }
2586do_indirects:
2587 /* Kill the remaining (whole) subtrees */
2588 switch (offsets[0]) {
2589 default:
2590 nr = i_data[EXT3_IND_BLOCK];
2591 if (nr) {
2592 ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
2593 i_data[EXT3_IND_BLOCK] = 0;
2594 }
2595 case EXT3_IND_BLOCK:
2596 nr = i_data[EXT3_DIND_BLOCK];
2597 if (nr) {
2598 ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
2599 i_data[EXT3_DIND_BLOCK] = 0;
2600 }
2601 case EXT3_DIND_BLOCK:
2602 nr = i_data[EXT3_TIND_BLOCK];
2603 if (nr) {
2604 ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
2605 i_data[EXT3_TIND_BLOCK] = 0;
2606 }
2607 case EXT3_TIND_BLOCK:
2608 ;
2609 }
2610
2611 ext3_discard_reservation(inode);
2612
2613 mutex_unlock(&ei->truncate_mutex);
2614 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
2615 ext3_mark_inode_dirty(handle, inode);
2616
2617 /*
2618 * In a multi-transaction truncate, we only make the final transaction
2619 * synchronous
2620 */
2621 if (IS_SYNC(inode))
2622 handle->h_sync = 1;
2623out_stop:
2624 /*
2625 * If this was a simple ftruncate(), and the file will remain alive
2626 * then we need to clear up the orphan record which we created above.
2627 * However, if this was a real unlink then we were called by
2628 * ext3_evict_inode(), and we allow that function to clean up the
2629 * orphan info for us.
2630 */
2631 if (inode->i_nlink)
2632 ext3_orphan_del(handle, inode);
2633
2634 ext3_journal_stop(handle);
2635 trace_ext3_truncate_exit(inode);
2636 return;
2637out_notrans:
2638 /*
2639 * Delete the inode from orphan list so that it doesn't stay there
2640 * forever and trigger assertion on umount.
2641 */
2642 if (inode->i_nlink)
2643 ext3_orphan_del(NULL, inode);
2644 trace_ext3_truncate_exit(inode);
2645}
2646
2647static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb,
2648 unsigned long ino, struct ext3_iloc *iloc)
2649{
2650 unsigned long block_group;
2651 unsigned long offset;
2652 ext3_fsblk_t block;
2653 struct ext3_group_desc *gdp;
2654
2655 if (!ext3_valid_inum(sb, ino)) {
2656 /*
2657 * This error is already checked for in namei.c unless we are
2658 * looking at an NFS filehandle, in which case no error
2659 * report is needed
2660 */
2661 return 0;
2662 }
2663
2664 block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
2665 gdp = ext3_get_group_desc(sb, block_group, NULL);
2666 if (!gdp)
2667 return 0;
2668 /*
2669 * Figure out the offset within the block group inode table
2670 */
2671 offset = ((ino - 1) % EXT3_INODES_PER_GROUP(sb)) *
2672 EXT3_INODE_SIZE(sb);
2673 block = le32_to_cpu(gdp->bg_inode_table) +
2674 (offset >> EXT3_BLOCK_SIZE_BITS(sb));
2675
2676 iloc->block_group = block_group;
2677 iloc->offset = offset & (EXT3_BLOCK_SIZE(sb) - 1);
2678 return block;
2679}
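/*
 * Worked example (illustrative geometry only): with 4096 inodes per
 * group, 128-byte inodes and a 1 KB block size, ino = 5000 gives
 * block_group = 4999 / 4096 = 1 and offset = (4999 % 4096) * 128 =
 * 115584, so the inode sits 115584 >> 10 = 112 blocks into that
 * group's inode table with iloc->offset = 115584 & 1023 = 896.
 */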
2680
2681/*
2682 * ext3_get_inode_loc returns with an extra refcount against the inode's
2683 * underlying buffer_head on success. If 'in_mem' is true, we have all
2684 * data in memory that is needed to recreate the on-disk version of this
2685 * inode.
2686 */
2687static int __ext3_get_inode_loc(struct inode *inode,
2688 struct ext3_iloc *iloc, int in_mem)
2689{
2690 ext3_fsblk_t block;
2691 struct buffer_head *bh;
2692
2693 block = ext3_get_inode_block(inode->i_sb, inode->i_ino, iloc);
2694 if (!block)
2695 return -EIO;
2696
2697 bh = sb_getblk(inode->i_sb, block);
2698 if (unlikely(!bh)) {
2699 ext3_error (inode->i_sb, "ext3_get_inode_loc",
2700 "unable to read inode block - "
2701 "inode=%lu, block="E3FSBLK,
2702 inode->i_ino, block);
2703 return -ENOMEM;
2704 }
2705 if (!buffer_uptodate(bh)) {
2706 lock_buffer(bh);
2707
2708 /*
2709 * If the buffer has the write error flag, we have failed
2710 * to write out another inode in the same block. In this
2711 * case, we don't have to read the block because we may
2712 * read the old inode data successfully.
2713 */
2714 if (buffer_write_io_error(bh) && !buffer_uptodate(bh))
2715 set_buffer_uptodate(bh);
2716
2717 if (buffer_uptodate(bh)) {
2718 /* someone brought it uptodate while we waited */
2719 unlock_buffer(bh);
2720 goto has_buffer;
2721 }
2722
2723 /*
2724 * If we have all information of the inode in memory and this
2725 * is the only valid inode in the block, we need not read the
2726 * block.
2727 */
2728 if (in_mem) {
2729 struct buffer_head *bitmap_bh;
2730 struct ext3_group_desc *desc;
2731 int inodes_per_buffer;
2732 int inode_offset, i;
2733 int block_group;
2734 int start;
2735
2736 block_group = (inode->i_ino - 1) /
2737 EXT3_INODES_PER_GROUP(inode->i_sb);
2738 inodes_per_buffer = bh->b_size /
2739 EXT3_INODE_SIZE(inode->i_sb);
2740 inode_offset = ((inode->i_ino - 1) %
2741 EXT3_INODES_PER_GROUP(inode->i_sb));
2742 start = inode_offset & ~(inodes_per_buffer - 1);
2743
2744 /* Is the inode bitmap in cache? */
2745 desc = ext3_get_group_desc(inode->i_sb,
2746 block_group, NULL);
2747 if (!desc)
2748 goto make_io;
2749
2750 bitmap_bh = sb_getblk(inode->i_sb,
2751 le32_to_cpu(desc->bg_inode_bitmap));
2752 if (unlikely(!bitmap_bh))
2753 goto make_io;
2754
2755 /*
2756 * If the inode bitmap isn't in cache then the
2757 * optimisation may end up performing two reads instead
2758 * of one, so skip it.
2759 */
2760 if (!buffer_uptodate(bitmap_bh)) {
2761 brelse(bitmap_bh);
2762 goto make_io;
2763 }
2764 for (i = start; i < start + inodes_per_buffer; i++) {
2765 if (i == inode_offset)
2766 continue;
2767 if (ext3_test_bit(i, bitmap_bh->b_data))
2768 break;
2769 }
2770 brelse(bitmap_bh);
2771 if (i == start + inodes_per_buffer) {
2772 /* all other inodes are free, so skip I/O */
2773 memset(bh->b_data, 0, bh->b_size);
2774 set_buffer_uptodate(bh);
2775 unlock_buffer(bh);
2776 goto has_buffer;
2777 }
2778 }
2779
2780make_io:
2781 /*
2782 * There are other valid inodes in the buffer, this inode
2783 * has in-inode xattrs, or we don't have this inode in memory.
2784 * Read the block from disk.
2785 */
2786 trace_ext3_load_inode(inode);
2787 get_bh(bh);
2788 bh->b_end_io = end_buffer_read_sync;
2789 submit_bh(READ | REQ_META | REQ_PRIO, bh);
2790 wait_on_buffer(bh);
2791 if (!buffer_uptodate(bh)) {
2792 ext3_error(inode->i_sb, "ext3_get_inode_loc",
2793 "unable to read inode block - "
2794 "inode=%lu, block="E3FSBLK,
2795 inode->i_ino, block);
2796 brelse(bh);
2797 return -EIO;
2798 }
2799 }
2800has_buffer:
2801 iloc->bh = bh;
2802 return 0;
2803}
2804
2805int ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc)
2806{
2807 /* We have all inode data except xattrs in memory here. */
2808 return __ext3_get_inode_loc(inode, iloc,
2809 !ext3_test_inode_state(inode, EXT3_STATE_XATTR));
2810}
2811
2812void ext3_set_inode_flags(struct inode *inode)
2813{
2814 unsigned int flags = EXT3_I(inode)->i_flags;
2815
2816 inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
2817 if (flags & EXT3_SYNC_FL)
2818 inode->i_flags |= S_SYNC;
2819 if (flags & EXT3_APPEND_FL)
2820 inode->i_flags |= S_APPEND;
2821 if (flags & EXT3_IMMUTABLE_FL)
2822 inode->i_flags |= S_IMMUTABLE;
2823 if (flags & EXT3_NOATIME_FL)
2824 inode->i_flags |= S_NOATIME;
2825 if (flags & EXT3_DIRSYNC_FL)
2826 inode->i_flags |= S_DIRSYNC;
2827}
2828
2829/* Propagate flags from i_flags to EXT3_I(inode)->i_flags */
2830void ext3_get_inode_flags(struct ext3_inode_info *ei)
2831{
2832 unsigned int flags = ei->vfs_inode.i_flags;
2833
2834 ei->i_flags &= ~(EXT3_SYNC_FL|EXT3_APPEND_FL|
2835 EXT3_IMMUTABLE_FL|EXT3_NOATIME_FL|EXT3_DIRSYNC_FL);
2836 if (flags & S_SYNC)
2837 ei->i_flags |= EXT3_SYNC_FL;
2838 if (flags & S_APPEND)
2839 ei->i_flags |= EXT3_APPEND_FL;
2840 if (flags & S_IMMUTABLE)
2841 ei->i_flags |= EXT3_IMMUTABLE_FL;
2842 if (flags & S_NOATIME)
2843 ei->i_flags |= EXT3_NOATIME_FL;
2844 if (flags & S_DIRSYNC)
2845 ei->i_flags |= EXT3_DIRSYNC_FL;
2846}
2847
2848struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
2849{
2850 struct ext3_iloc iloc;
2851 struct ext3_inode *raw_inode;
2852 struct ext3_inode_info *ei;
2853 struct buffer_head *bh;
2854 struct inode *inode;
2855 journal_t *journal = EXT3_SB(sb)->s_journal;
2856 transaction_t *transaction;
2857 long ret;
2858 int block;
2859 uid_t i_uid;
2860 gid_t i_gid;
2861
2862 inode = iget_locked(sb, ino);
2863 if (!inode)
2864 return ERR_PTR(-ENOMEM);
2865 if (!(inode->i_state & I_NEW))
2866 return inode;
2867
2868 ei = EXT3_I(inode);
2869 ei->i_block_alloc_info = NULL;
2870
2871 ret = __ext3_get_inode_loc(inode, &iloc, 0);
2872 if (ret < 0)
2873 goto bad_inode;
2874 bh = iloc.bh;
2875 raw_inode = ext3_raw_inode(&iloc);
2876 inode->i_mode = le16_to_cpu(raw_inode->i_mode);
2877 i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
2878 i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
2879 if(!(test_opt (inode->i_sb, NO_UID32))) {
2880 i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
2881 i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
2882 }
2883 i_uid_write(inode, i_uid);
2884 i_gid_write(inode, i_gid);
2885 set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
2886 inode->i_size = le32_to_cpu(raw_inode->i_size);
2887 inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime);
2888 inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime);
2889 inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime);
2890 inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0;
2891
2892 ei->i_state_flags = 0;
2893 ei->i_dir_start_lookup = 0;
2894 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
2895 /* We now have enough fields to check if the inode was active or not.
2896 * This is needed because nfsd might try to access dead inodes
2897	 * the test is the same one that e2fsck uses
2898 * NeilBrown 1999oct15
2899 */
2900 if (inode->i_nlink == 0) {
2901 if (inode->i_mode == 0 ||
2902 !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ORPHAN_FS)) {
2903 /* this inode is deleted */
2904 brelse (bh);
2905 ret = -ESTALE;
2906 goto bad_inode;
2907 }
2908 /* The only unlinked inodes we let through here have
2909 * valid i_mode and are being read by the orphan
2910 * recovery code: that's fine, we're about to complete
2911 * the process of deleting those. */
2912 }
2913 inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
2914 ei->i_flags = le32_to_cpu(raw_inode->i_flags);
2915#ifdef EXT3_FRAGMENTS
2916 ei->i_faddr = le32_to_cpu(raw_inode->i_faddr);
2917 ei->i_frag_no = raw_inode->i_frag;
2918 ei->i_frag_size = raw_inode->i_fsize;
2919#endif
2920 ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
2921 if (!S_ISREG(inode->i_mode)) {
2922 ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
2923 } else {
2924 inode->i_size |=
2925 ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
2926 }
2927 ei->i_disksize = inode->i_size;
2928 inode->i_generation = le32_to_cpu(raw_inode->i_generation);
2929 ei->i_block_group = iloc.block_group;
2930 /*
2931 * NOTE! The in-memory inode i_data array is in little-endian order
2932 * even on big-endian machines: we do NOT byteswap the block numbers!
2933 */
2934 for (block = 0; block < EXT3_N_BLOCKS; block++)
2935 ei->i_data[block] = raw_inode->i_block[block];
2936 INIT_LIST_HEAD(&ei->i_orphan);
2937
2938 /*
2939 * Set transaction id's of transactions that have to be committed
2940 * to finish f[data]sync. We set them to currently running transaction
2941 * as we cannot be sure that the inode or some of its metadata isn't
2942 * part of the transaction - the inode could have been reclaimed and
2943 * now it is reread from disk.
2944 */
2945 if (journal) {
2946 tid_t tid;
2947
2948 spin_lock(&journal->j_state_lock);
2949 if (journal->j_running_transaction)
2950 transaction = journal->j_running_transaction;
2951 else
2952 transaction = journal->j_committing_transaction;
2953 if (transaction)
2954 tid = transaction->t_tid;
2955 else
2956 tid = journal->j_commit_sequence;
2957 spin_unlock(&journal->j_state_lock);
2958 atomic_set(&ei->i_sync_tid, tid);
2959 atomic_set(&ei->i_datasync_tid, tid);
2960 }
2961
2962 if (inode->i_ino >= EXT3_FIRST_INO(inode->i_sb) + 1 &&
2963 EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) {
2964 /*
2965 * When mke2fs creates big inodes it does not zero out
2966 * the unused bytes above EXT3_GOOD_OLD_INODE_SIZE,
2967 * so ignore those first few inodes.
2968 */
2969 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
2970 if (EXT3_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
2971 EXT3_INODE_SIZE(inode->i_sb)) {
2972 brelse (bh);
2973 ret = -EIO;
2974 goto bad_inode;
2975 }
2976 if (ei->i_extra_isize == 0) {
2977 /* The extra space is currently unused. Use it. */
2978 ei->i_extra_isize = sizeof(struct ext3_inode) -
2979 EXT3_GOOD_OLD_INODE_SIZE;
2980 } else {
2981 __le32 *magic = (void *)raw_inode +
2982 EXT3_GOOD_OLD_INODE_SIZE +
2983 ei->i_extra_isize;
2984 if (*magic == cpu_to_le32(EXT3_XATTR_MAGIC))
2985 ext3_set_inode_state(inode, EXT3_STATE_XATTR);
2986 }
2987 } else
2988 ei->i_extra_isize = 0;
2989
2990 if (S_ISREG(inode->i_mode)) {
2991 inode->i_op = &ext3_file_inode_operations;
2992 inode->i_fop = &ext3_file_operations;
2993 ext3_set_aops(inode);
2994 } else if (S_ISDIR(inode->i_mode)) {
2995 inode->i_op = &ext3_dir_inode_operations;
2996 inode->i_fop = &ext3_dir_operations;
2997 } else if (S_ISLNK(inode->i_mode)) {
2998 if (ext3_inode_is_fast_symlink(inode)) {
2999 inode->i_op = &ext3_fast_symlink_inode_operations;
3000 nd_terminate_link(ei->i_data, inode->i_size,
3001 sizeof(ei->i_data) - 1);
3002 inode->i_link = (char *)ei->i_data;
3003 } else {
3004 inode->i_op = &ext3_symlink_inode_operations;
3005 ext3_set_aops(inode);
3006 }
3007 } else {
3008 inode->i_op = &ext3_special_inode_operations;
3009 if (raw_inode->i_block[0])
3010 init_special_inode(inode, inode->i_mode,
3011 old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
3012 else
3013 init_special_inode(inode, inode->i_mode,
3014 new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
3015 }
3016 brelse (iloc.bh);
3017 ext3_set_inode_flags(inode);
3018 unlock_new_inode(inode);
3019 return inode;
3020
3021bad_inode:
3022 iget_failed(inode);
3023 return ERR_PTR(ret);
3024}
3025
3026/*
3027 * Post the struct inode info into an on-disk inode location in the
3028 * buffer-cache. This gobbles the caller's reference to the
3029 * buffer_head in the inode location struct.
3030 *
3031 * The caller must have write access to iloc->bh.
3032 */
3033static int ext3_do_update_inode(handle_t *handle,
3034 struct inode *inode,
3035 struct ext3_iloc *iloc)
3036{
3037 struct ext3_inode *raw_inode = ext3_raw_inode(iloc);
3038 struct ext3_inode_info *ei = EXT3_I(inode);
3039 struct buffer_head *bh = iloc->bh;
3040 int err = 0, rc, block;
3041 int need_datasync = 0;
3042 __le32 disksize;
3043 uid_t i_uid;
3044 gid_t i_gid;
3045
3046again:
3047	 /* we can't allow multiple procs in here at once; it's a bit racy */
3048 lock_buffer(bh);
3049
3050	 /* For fields not tracked in the in-memory inode,
3051 * initialise them to zero for new inodes. */
3052 if (ext3_test_inode_state(inode, EXT3_STATE_NEW))
3053 memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);
3054
3055 ext3_get_inode_flags(ei);
3056 raw_inode->i_mode = cpu_to_le16(inode->i_mode);
3057 i_uid = i_uid_read(inode);
3058 i_gid = i_gid_read(inode);
3059 if(!(test_opt(inode->i_sb, NO_UID32))) {
3060 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid));
3061 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid));
3062/*
3063 * Fix up interoperability with old kernels. Otherwise, old inodes get
3064 * re-used with the upper 16 bits of the uid/gid intact
3065 */
3066 if(!ei->i_dtime) {
3067 raw_inode->i_uid_high =
3068 cpu_to_le16(high_16_bits(i_uid));
3069 raw_inode->i_gid_high =
3070 cpu_to_le16(high_16_bits(i_gid));
3071 } else {
3072 raw_inode->i_uid_high = 0;
3073 raw_inode->i_gid_high = 0;
3074 }
3075 } else {
3076 raw_inode->i_uid_low =
3077 cpu_to_le16(fs_high2lowuid(i_uid));
3078 raw_inode->i_gid_low =
3079 cpu_to_le16(fs_high2lowgid(i_gid));
3080 raw_inode->i_uid_high = 0;
3081 raw_inode->i_gid_high = 0;
3082 }
3083 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
3084 disksize = cpu_to_le32(ei->i_disksize);
3085 if (disksize != raw_inode->i_size) {
3086 need_datasync = 1;
3087 raw_inode->i_size = disksize;
3088 }
3089 raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
3090 raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
3091 raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
3092 raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
3093 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
3094 raw_inode->i_flags = cpu_to_le32(ei->i_flags);
3095#ifdef EXT3_FRAGMENTS
3096 raw_inode->i_faddr = cpu_to_le32(ei->i_faddr);
3097 raw_inode->i_frag = ei->i_frag_no;
3098 raw_inode->i_fsize = ei->i_frag_size;
3099#endif
3100 raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl);
3101 if (!S_ISREG(inode->i_mode)) {
3102 raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl);
3103 } else {
3104 disksize = cpu_to_le32(ei->i_disksize >> 32);
3105 if (disksize != raw_inode->i_size_high) {
3106 raw_inode->i_size_high = disksize;
3107 need_datasync = 1;
3108 }
3109 if (ei->i_disksize > 0x7fffffffULL) {
3110 struct super_block *sb = inode->i_sb;
3111 if (!EXT3_HAS_RO_COMPAT_FEATURE(sb,
3112 EXT3_FEATURE_RO_COMPAT_LARGE_FILE) ||
3113 EXT3_SB(sb)->s_es->s_rev_level ==
3114 cpu_to_le32(EXT3_GOOD_OLD_REV)) {
3115 /* If this is the first large file
3116 * created, add a flag to the superblock.
3117 */
3118 unlock_buffer(bh);
3119 err = ext3_journal_get_write_access(handle,
3120 EXT3_SB(sb)->s_sbh);
3121 if (err)
3122 goto out_brelse;
3123
3124 ext3_update_dynamic_rev(sb);
3125 EXT3_SET_RO_COMPAT_FEATURE(sb,
3126 EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
3127 handle->h_sync = 1;
3128 err = ext3_journal_dirty_metadata(handle,
3129 EXT3_SB(sb)->s_sbh);
3130 /* get our lock and start over */
3131 goto again;
3132 }
3133 }
3134 }
3135 raw_inode->i_generation = cpu_to_le32(inode->i_generation);
3136 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
3137 if (old_valid_dev(inode->i_rdev)) {
3138 raw_inode->i_block[0] =
3139 cpu_to_le32(old_encode_dev(inode->i_rdev));
3140 raw_inode->i_block[1] = 0;
3141 } else {
3142 raw_inode->i_block[0] = 0;
3143 raw_inode->i_block[1] =
3144 cpu_to_le32(new_encode_dev(inode->i_rdev));
3145 raw_inode->i_block[2] = 0;
3146 }
3147 } else for (block = 0; block < EXT3_N_BLOCKS; block++)
3148 raw_inode->i_block[block] = ei->i_data[block];
3149
3150 if (ei->i_extra_isize)
3151 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
3152
3153 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
3154 unlock_buffer(bh);
3155 rc = ext3_journal_dirty_metadata(handle, bh);
3156 if (!err)
3157 err = rc;
3158 ext3_clear_inode_state(inode, EXT3_STATE_NEW);
3159
3160 atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid);
3161 if (need_datasync)
3162 atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid);
3163out_brelse:
3164 brelse (bh);
3165 ext3_std_error(inode->i_sb, err);
3166 return err;
3167}
3168
3169/*
3170 * ext3_write_inode()
3171 *
3172 * We are called from a few places:
3173 *
3174 * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files.
3175 * Here, there will be no transaction running. We wait for any running
3176 * transaction to commit.
3177 *
3178 * - Within flush work (for sys_sync(), kupdate and such).
3179 * We wait on commit, if told to.
3180 *
3181 * - Within iput_final() -> write_inode_now()
3182 * We wait on commit, if told to.
3183 *
3184 * In all cases it is actually safe for us to return without doing anything,
3185 * because the inode has been copied into a raw inode buffer in
3186 * ext3_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL
3187 * writeback.
3188 *
3189 * Note that we are absolutely dependent upon all inode dirtiers doing the
3190 * right thing: they *must* call mark_inode_dirty() after dirtying info in
3191 * which we are interested.
3192 *
3193 * It would be a bug for them to not do this. The code:
3194 *
3195 * mark_inode_dirty(inode)
3196 * stuff();
3197 * inode->i_size = expr;
3198 *
3199 * is in error because write_inode() could occur while `stuff()' is running,
3200 * and the new i_size will be lost. Plus the inode will no longer be on the
3201 * superblock's dirty inode list.
3202 */
3203int ext3_write_inode(struct inode *inode, struct writeback_control *wbc)
3204{
3205 if (WARN_ON_ONCE(current->flags & PF_MEMALLOC))
3206 return 0;
3207
3208 if (ext3_journal_current_handle()) {
3209 jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
3210 dump_stack();
3211 return -EIO;
3212 }
3213
3214 /*
3215 * No need to force transaction in WB_SYNC_NONE mode. Also
3216 * ext3_sync_fs() will force the commit after everything is
3217 * written.
3218 */
3219 if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync)
3220 return 0;
3221
3222 return ext3_force_commit(inode->i_sb);
3223}
3224
3225/*
3226 * ext3_setattr()
3227 *
3228 * Called from notify_change.
3229 *
3230 * We want to trap VFS attempts to truncate the file as soon as
3231 * possible. In particular, we want to make sure that when the VFS
3232 * shrinks i_size, we put the inode on the orphan list and modify
3233 * i_disksize immediately, so that during the subsequent flushing of
3234 * dirty pages and freeing of disk blocks, we can guarantee that any
3235 * commit will leave the blocks being flushed in an unused state on
3236 * disk. (On recovery, the inode will get truncated and the blocks will
3237 * be freed, so we have a strong guarantee that no future commit will
3238 * leave these blocks visible to the user.)
3239 *
3240 * Called with inode->sem down.
3241 */
3242int ext3_setattr(struct dentry *dentry, struct iattr *attr)
3243{
3244 struct inode *inode = d_inode(dentry);
3245 int error, rc = 0;
3246 const unsigned int ia_valid = attr->ia_valid;
3247
3248 error = inode_change_ok(inode, attr);
3249 if (error)
3250 return error;
3251
3252 if (is_quota_modification(inode, attr))
3253 dquot_initialize(inode);
3254 if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) ||
3255 (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
3256 handle_t *handle;
3257
3258 /* (user+group)*(old+new) structure, inode write (sb,
3259 * inode block, ? - but truncate inode update has it) */
3260 handle = ext3_journal_start(inode, EXT3_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
3261 EXT3_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)+3);
3262 if (IS_ERR(handle)) {
3263 error = PTR_ERR(handle);
3264 goto err_out;
3265 }
3266 error = dquot_transfer(inode, attr);
3267 if (error) {
3268 ext3_journal_stop(handle);
3269 return error;
3270 }
3271 /* Update corresponding info in inode so that everything is in
3272 * one transaction */
3273 if (attr->ia_valid & ATTR_UID)
3274 inode->i_uid = attr->ia_uid;
3275 if (attr->ia_valid & ATTR_GID)
3276 inode->i_gid = attr->ia_gid;
3277 error = ext3_mark_inode_dirty(handle, inode);
3278 ext3_journal_stop(handle);
3279 }
3280
3281 if (attr->ia_valid & ATTR_SIZE)
3282 inode_dio_wait(inode);
3283
3284 if (S_ISREG(inode->i_mode) &&
3285 attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
3286 handle_t *handle;
3287
3288 handle = ext3_journal_start(inode, 3);
3289 if (IS_ERR(handle)) {
3290 error = PTR_ERR(handle);
3291 goto err_out;
3292 }
3293
3294 error = ext3_orphan_add(handle, inode);
3295 if (error) {
3296 ext3_journal_stop(handle);
3297 goto err_out;
3298 }
3299 EXT3_I(inode)->i_disksize = attr->ia_size;
3300 error = ext3_mark_inode_dirty(handle, inode);
3301 ext3_journal_stop(handle);
3302 if (error) {
3303 /* Some hard fs error must have happened. Bail out. */
3304 ext3_orphan_del(NULL, inode);
3305 goto err_out;
3306 }
3307 rc = ext3_block_truncate_page(inode, attr->ia_size);
3308 if (rc) {
3309 /* Cleanup orphan list and exit */
3310 handle = ext3_journal_start(inode, 3);
3311 if (IS_ERR(handle)) {
3312 ext3_orphan_del(NULL, inode);
3313 goto err_out;
3314 }
3315 ext3_orphan_del(handle, inode);
3316 ext3_journal_stop(handle);
3317 goto err_out;
3318 }
3319 }
3320
3321 if ((attr->ia_valid & ATTR_SIZE) &&
3322 attr->ia_size != i_size_read(inode)) {
3323 truncate_setsize(inode, attr->ia_size);
3324 ext3_truncate(inode);
3325 }
3326
3327 setattr_copy(inode, attr);
3328 mark_inode_dirty(inode);
3329
3330 if (ia_valid & ATTR_MODE)
3331 rc = posix_acl_chmod(inode, inode->i_mode);
3332
3333err_out:
3334 ext3_std_error(inode->i_sb, error);
3335 if (!error)
3336 error = rc;
3337 return error;
3338}
3339
3340
3341/*
3342 * How many blocks doth make a writepage()?
3343 *
3344 * With N blocks per page, it may be:
3345 * N data blocks
3346 * 2 indirect block
3347 * 2 dindirect
3348 * 1 tindirect
3349 * N+5 bitmap blocks (from the above)
3350 * N+5 group descriptor summary blocks
3351 * 1 inode block
3352 * 1 superblock.
3353	 * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quota files
3354 *
3355 * 3 * (N + 5) + 2 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS
3356 *
3357 * With ordered or writeback data it's the same, less the N data blocks.
3358 *
3359 * If the inode's direct blocks can hold an integral number of pages then a
3360 * page cannot straddle two indirect blocks, and we can only touch one indirect
3361 * and dindirect block, and the "5" above becomes "3".
3362 *
3363 * This still overestimates under most circumstances. If we were to pass the
3364 * start and end offsets in here as well we could do block_to_path() on each
3365 * block and work out the exact number of indirects which are touched. Pah.
3366 */
3367
3368static int ext3_writepage_trans_blocks(struct inode *inode)
3369{
3370 int bpp = ext3_journal_blocks_per_page(inode);
3371 int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3;
3372 int ret;
3373
3374 if (ext3_should_journal_data(inode))
3375 ret = 3 * (bpp + indirects) + 2;
3376 else
3377 ret = 2 * (bpp + indirects) + indirects + 2;
3378
3379#ifdef CONFIG_QUOTA
3380 /* We know that structure was already allocated during dquot_initialize so
3381 * we will be updating only the data blocks + inodes */
3382 ret += EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
3383#endif
3384
3385 return ret;
3386}
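/*
 * Illustrative credit count (assuming a 4 KB page and a 1 KB block
 * size, so bpp = 4): EXT3_NDIR_BLOCKS (12) is a multiple of 4, hence
 * indirects = 3; data=journal then reserves 3 * (4 + 3) + 2 = 23
 * credits per writepage, while data=ordered/writeback reserves
 * 2 * (4 + 3) + 3 + 2 = 19, plus the quota credits when CONFIG_QUOTA
 * is enabled.
 */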
3387
3388/*
3389 * The caller must have previously called ext3_reserve_inode_write().
3390	 * Given this, we know that the caller already has write access to iloc->bh.
3391 */
3392int ext3_mark_iloc_dirty(handle_t *handle,
3393 struct inode *inode, struct ext3_iloc *iloc)
3394{
3395 int err = 0;
3396
3397 /* the do_update_inode consumes one bh->b_count */
3398 get_bh(iloc->bh);
3399
3400 /* ext3_do_update_inode() does journal_dirty_metadata */
3401 err = ext3_do_update_inode(handle, inode, iloc);
3402 put_bh(iloc->bh);
3403 return err;
3404}
3405
3406/*
3407 * On success, We end up with an outstanding reference count against
3408 * iloc->bh. This _must_ be cleaned up later.
3409 */
3410
3411int
3412ext3_reserve_inode_write(handle_t *handle, struct inode *inode,
3413 struct ext3_iloc *iloc)
3414{
3415 int err = 0;
3416 if (handle) {
3417 err = ext3_get_inode_loc(inode, iloc);
3418 if (!err) {
3419 BUFFER_TRACE(iloc->bh, "get_write_access");
3420 err = ext3_journal_get_write_access(handle, iloc->bh);
3421 if (err) {
3422 brelse(iloc->bh);
3423 iloc->bh = NULL;
3424 }
3425 }
3426 }
3427 ext3_std_error(inode->i_sb, err);
3428 return err;
3429}
3430
3431/*
3432 * What we do here is to mark the in-core inode as clean with respect to inode
3433 * dirtiness (it may still be data-dirty).
3434 * This means that the in-core inode may be reaped by prune_icache
3435 * without having to perform any I/O. This is a very good thing,
3436 * because *any* task may call prune_icache - even ones which
3437 * have a transaction open against a different journal.
3438 *
3439 * Is this cheating? Not really. Sure, we haven't written the
3440 * inode out, but prune_icache isn't a user-visible syncing function.
3441 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
3442 * we start and wait on commits.
3443 */
3444int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
3445{
3446 struct ext3_iloc iloc;
3447 int err;
3448
3449 might_sleep();
3450 trace_ext3_mark_inode_dirty(inode, _RET_IP_);
3451 err = ext3_reserve_inode_write(handle, inode, &iloc);
3452 if (!err)
3453 err = ext3_mark_iloc_dirty(handle, inode, &iloc);
3454 return err;
3455}
3456
3457/*
3458 * ext3_dirty_inode() is called from __mark_inode_dirty()
3459 *
3460 * We're really interested in the case where a file is being extended.
3461 * i_size has been changed by generic_commit_write() and we thus need
3462 * to include the updated inode in the current transaction.
3463 *
3464 * Also, dquot_alloc_space() will always dirty the inode when blocks
3465 * are allocated to the file.
3466 *
3467 * If the inode is marked synchronous, we don't honour that here - doing
3468 * so would cause a commit on atime updates, which we don't bother doing.
3469 * We handle synchronous inodes at the highest possible level.
3470 */
3471void ext3_dirty_inode(struct inode *inode, int flags)
3472{
3473 handle_t *current_handle = ext3_journal_current_handle();
3474 handle_t *handle;
3475
3476 handle = ext3_journal_start(inode, 2);
3477 if (IS_ERR(handle))
3478 goto out;
3479 if (current_handle &&
3480 current_handle->h_transaction != handle->h_transaction) {
3481 /* This task has a transaction open against a different fs */
3482 printk(KERN_EMERG "%s: transactions do not match!\n",
3483 __func__);
3484 } else {
3485 jbd_debug(5, "marking dirty. outer handle=%p\n",
3486 current_handle);
3487 ext3_mark_inode_dirty(handle, inode);
3488 }
3489 ext3_journal_stop(handle);
3490out:
3491 return;
3492}
3493
3494#if 0
3495/*
3496 * Bind an inode's backing buffer_head into this transaction, to prevent
3497 * it from being flushed to disk early. Unlike
3498 * ext3_reserve_inode_write, this leaves behind no bh reference and
3499 * returns no iloc structure, so the caller needs to repeat the iloc
3500 * lookup to mark the inode dirty later.
3501 */
3502static int ext3_pin_inode(handle_t *handle, struct inode *inode)
3503{
3504 struct ext3_iloc iloc;
3505
3506 int err = 0;
3507 if (handle) {
3508 err = ext3_get_inode_loc(inode, &iloc);
3509 if (!err) {
3510 BUFFER_TRACE(iloc.bh, "get_write_access");
3511 err = journal_get_write_access(handle, iloc.bh);
3512 if (!err)
3513 err = ext3_journal_dirty_metadata(handle,
3514 iloc.bh);
3515 brelse(iloc.bh);
3516 }
3517 }
3518 ext3_std_error(inode->i_sb, err);
3519 return err;
3520}
3521#endif
3522
3523int ext3_change_inode_journal_flag(struct inode *inode, int val)
3524{
3525 journal_t *journal;
3526 handle_t *handle;
3527 int err;
3528
3529 /*
3530 * We have to be very careful here: changing a data block's
3531 * journaling status dynamically is dangerous. If we write a
3532 * data block to the journal, change the status and then delete
3533 * that block, we risk forgetting to revoke the old log record
3534 * from the journal and so a subsequent replay can corrupt data.
3535 * So, first we make sure that the journal is empty and that
3536 * nobody is changing anything.
3537 */
3538
3539 journal = EXT3_JOURNAL(inode);
3540 if (is_journal_aborted(journal))
3541 return -EROFS;
3542
3543 journal_lock_updates(journal);
3544 journal_flush(journal);
3545
3546 /*
3547 * OK, there are no updates running now, and all cached data is
3548 * synced to disk. We are now in a completely consistent state
3549 * which doesn't have anything in the journal, and we know that
3550 * no filesystem updates are running, so it is safe to modify
3551 * the inode's in-core data-journaling state flag now.
3552 */
3553
3554 if (val)
3555 EXT3_I(inode)->i_flags |= EXT3_JOURNAL_DATA_FL;
3556 else
3557 EXT3_I(inode)->i_flags &= ~EXT3_JOURNAL_DATA_FL;
3558 ext3_set_aops(inode);
3559
3560 journal_unlock_updates(journal);
3561
3562 /* Finally we can mark the inode as dirty. */
3563
3564 handle = ext3_journal_start(inode, 1);
3565 if (IS_ERR(handle))
3566 return PTR_ERR(handle);
3567
3568 err = ext3_mark_inode_dirty(handle, inode);
3569 handle->h_sync = 1;
3570 ext3_journal_stop(handle);
3571 ext3_std_error(inode->i_sb, err);
3572
3573 return err;
3574}
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
deleted file mode 100644
index 4d96e9a64532..000000000000
--- a/fs/ext3/ioctl.c
+++ /dev/null
@@ -1,327 +0,0 @@
1/*
2 * linux/fs/ext3/ioctl.c
3 *
4 * Copyright (C) 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 */
9
10#include <linux/mount.h>
11#include <linux/compat.h>
12#include <asm/uaccess.h>
13#include "ext3.h"
14
15long ext3_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
16{
17 struct inode *inode = file_inode(filp);
18 struct ext3_inode_info *ei = EXT3_I(inode);
19 unsigned int flags;
20 unsigned short rsv_window_size;
21
22 ext3_debug ("cmd = %u, arg = %lu\n", cmd, arg);
23
24 switch (cmd) {
25 case EXT3_IOC_GETFLAGS:
26 ext3_get_inode_flags(ei);
27 flags = ei->i_flags & EXT3_FL_USER_VISIBLE;
28 return put_user(flags, (int __user *) arg);
29 case EXT3_IOC_SETFLAGS: {
30 handle_t *handle = NULL;
31 int err;
32 struct ext3_iloc iloc;
33 unsigned int oldflags;
34 unsigned int jflag;
35
36 if (!inode_owner_or_capable(inode))
37 return -EACCES;
38
39 if (get_user(flags, (int __user *) arg))
40 return -EFAULT;
41
42 err = mnt_want_write_file(filp);
43 if (err)
44 return err;
45
46 flags = ext3_mask_flags(inode->i_mode, flags);
47
48 mutex_lock(&inode->i_mutex);
49
50 /* Is it a quota file? Do not allow the user to mess with it */
51 err = -EPERM;
52 if (IS_NOQUOTA(inode))
53 goto flags_out;
54
55 oldflags = ei->i_flags;
56
57 /* The JOURNAL_DATA flag is modifiable only with CAP_SYS_RESOURCE */
58 jflag = flags & EXT3_JOURNAL_DATA_FL;
59
60 /*
61 * The IMMUTABLE and APPEND_ONLY flags can only be changed by
62 * a caller with the relevant capability.
63 *
64 * This test looks nicer. Thanks to Pauline Middelink
65 */
66 if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL)) {
67 if (!capable(CAP_LINUX_IMMUTABLE))
68 goto flags_out;
69 }
70
71 /*
72 * The JOURNAL_DATA flag can only be changed by
73 * a caller with the relevant capability.
74 */
75 if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) {
76 if (!capable(CAP_SYS_RESOURCE))
77 goto flags_out;
78 }
79
80 handle = ext3_journal_start(inode, 1);
81 if (IS_ERR(handle)) {
82 err = PTR_ERR(handle);
83 goto flags_out;
84 }
85 if (IS_SYNC(inode))
86 handle->h_sync = 1;
87 err = ext3_reserve_inode_write(handle, inode, &iloc);
88 if (err)
89 goto flags_err;
90
91 flags = flags & EXT3_FL_USER_MODIFIABLE;
92 flags |= oldflags & ~EXT3_FL_USER_MODIFIABLE;
93 ei->i_flags = flags;
94
95 ext3_set_inode_flags(inode);
96 inode->i_ctime = CURRENT_TIME_SEC;
97
98 err = ext3_mark_iloc_dirty(handle, inode, &iloc);
99flags_err:
100 ext3_journal_stop(handle);
101 if (err)
102 goto flags_out;
103
104 if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL))
105 err = ext3_change_inode_journal_flag(inode, jflag);
106flags_out:
107 mutex_unlock(&inode->i_mutex);
108 mnt_drop_write_file(filp);
109 return err;
110 }
111 case EXT3_IOC_GETVERSION:
112 case EXT3_IOC_GETVERSION_OLD:
113 return put_user(inode->i_generation, (int __user *) arg);
114 case EXT3_IOC_SETVERSION:
115 case EXT3_IOC_SETVERSION_OLD: {
116 handle_t *handle;
117 struct ext3_iloc iloc;
118 __u32 generation;
119 int err;
120
121 if (!inode_owner_or_capable(inode))
122 return -EPERM;
123
124 err = mnt_want_write_file(filp);
125 if (err)
126 return err;
127 if (get_user(generation, (int __user *) arg)) {
128 err = -EFAULT;
129 goto setversion_out;
130 }
131
132 mutex_lock(&inode->i_mutex);
133 handle = ext3_journal_start(inode, 1);
134 if (IS_ERR(handle)) {
135 err = PTR_ERR(handle);
136 goto unlock_out;
137 }
138 err = ext3_reserve_inode_write(handle, inode, &iloc);
139 if (err == 0) {
140 inode->i_ctime = CURRENT_TIME_SEC;
141 inode->i_generation = generation;
142 err = ext3_mark_iloc_dirty(handle, inode, &iloc);
143 }
144 ext3_journal_stop(handle);
145
146unlock_out:
147 mutex_unlock(&inode->i_mutex);
148setversion_out:
149 mnt_drop_write_file(filp);
150 return err;
151 }
152 case EXT3_IOC_GETRSVSZ:
153 if (test_opt(inode->i_sb, RESERVATION)
154 && S_ISREG(inode->i_mode)
155 && ei->i_block_alloc_info) {
156 rsv_window_size = ei->i_block_alloc_info->rsv_window_node.rsv_goal_size;
157 return put_user(rsv_window_size, (int __user *)arg);
158 }
159 return -ENOTTY;
160 case EXT3_IOC_SETRSVSZ: {
161 int err;
162
163 if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode))
164 return -ENOTTY;
165
166 err = mnt_want_write_file(filp);
167 if (err)
168 return err;
169
170 if (!inode_owner_or_capable(inode)) {
171 err = -EACCES;
172 goto setrsvsz_out;
173 }
174
175 if (get_user(rsv_window_size, (int __user *)arg)) {
176 err = -EFAULT;
177 goto setrsvsz_out;
178 }
179
180 if (rsv_window_size > EXT3_MAX_RESERVE_BLOCKS)
181 rsv_window_size = EXT3_MAX_RESERVE_BLOCKS;
182
183 /*
184 * need to allocate the reservation structure for this inode
185 * before setting the window size
186 */
187 mutex_lock(&ei->truncate_mutex);
188 if (!ei->i_block_alloc_info)
189 ext3_init_block_alloc_info(inode);
190
191 if (ei->i_block_alloc_info){
192 struct ext3_reserve_window_node *rsv = &ei->i_block_alloc_info->rsv_window_node;
193 rsv->rsv_goal_size = rsv_window_size;
194 }
195 mutex_unlock(&ei->truncate_mutex);
196setrsvsz_out:
197 mnt_drop_write_file(filp);
198 return err;
199 }
200 case EXT3_IOC_GROUP_EXTEND: {
201 ext3_fsblk_t n_blocks_count;
202 struct super_block *sb = inode->i_sb;
203 int err, err2;
204
205 if (!capable(CAP_SYS_RESOURCE))
206 return -EPERM;
207
208 err = mnt_want_write_file(filp);
209 if (err)
210 return err;
211
212 if (get_user(n_blocks_count, (__u32 __user *)arg)) {
213 err = -EFAULT;
214 goto group_extend_out;
215 }
216 err = ext3_group_extend(sb, EXT3_SB(sb)->s_es, n_blocks_count);
217 journal_lock_updates(EXT3_SB(sb)->s_journal);
218 err2 = journal_flush(EXT3_SB(sb)->s_journal);
219 journal_unlock_updates(EXT3_SB(sb)->s_journal);
220 if (err == 0)
221 err = err2;
222group_extend_out:
223 mnt_drop_write_file(filp);
224 return err;
225 }
226 case EXT3_IOC_GROUP_ADD: {
227 struct ext3_new_group_data input;
228 struct super_block *sb = inode->i_sb;
229 int err, err2;
230
231 if (!capable(CAP_SYS_RESOURCE))
232 return -EPERM;
233
234 err = mnt_want_write_file(filp);
235 if (err)
236 return err;
237
238 if (copy_from_user(&input, (struct ext3_new_group_input __user *)arg,
239 sizeof(input))) {
240 err = -EFAULT;
241 goto group_add_out;
242 }
243
244 err = ext3_group_add(sb, &input);
245 journal_lock_updates(EXT3_SB(sb)->s_journal);
246 err2 = journal_flush(EXT3_SB(sb)->s_journal);
247 journal_unlock_updates(EXT3_SB(sb)->s_journal);
248 if (err == 0)
249 err = err2;
250group_add_out:
251 mnt_drop_write_file(filp);
252 return err;
253 }
254 case FITRIM: {
255
256 struct super_block *sb = inode->i_sb;
257 struct fstrim_range range;
258 int ret = 0;
259
260 if (!capable(CAP_SYS_ADMIN))
261 return -EPERM;
262
263 if (copy_from_user(&range, (struct fstrim_range __user *)arg,
264 sizeof(range)))
265 return -EFAULT;
266
267 ret = ext3_trim_fs(sb, &range);
268 if (ret < 0)
269 return ret;
270
271 if (copy_to_user((struct fstrim_range __user *)arg, &range,
272 sizeof(range)))
273 return -EFAULT;
274
275 return 0;
276 }
277
278 default:
279 return -ENOTTY;
280 }
281}
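
/*
 * Editor's sketch, not part of the original ext3 source: the
 * EXT3_IOC_SETFLAGS handler above merges the caller-supplied flags with
 * the existing ones so that only user-modifiable bits can change.  The
 * disabled helper below restates that masking arithmetic in isolation.
 */
#if 0
static unsigned int example_merge_setflags(unsigned int oldflags,
					   unsigned int userflags)
{
	unsigned int flags;

	/* take the user-controllable bits from the caller ... */
	flags = userflags & EXT3_FL_USER_MODIFIABLE;
	/* ... and keep every other bit exactly as it was */
	flags |= oldflags & ~EXT3_FL_USER_MODIFIABLE;
	return flags;
}
#endif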
282
283#ifdef CONFIG_COMPAT
284long ext3_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
285{
286 /* These are just misnamed; they actually get/put an int from/to user space */
287 switch (cmd) {
288 case EXT3_IOC32_GETFLAGS:
289 cmd = EXT3_IOC_GETFLAGS;
290 break;
291 case EXT3_IOC32_SETFLAGS:
292 cmd = EXT3_IOC_SETFLAGS;
293 break;
294 case EXT3_IOC32_GETVERSION:
295 cmd = EXT3_IOC_GETVERSION;
296 break;
297 case EXT3_IOC32_SETVERSION:
298 cmd = EXT3_IOC_SETVERSION;
299 break;
300 case EXT3_IOC32_GROUP_EXTEND:
301 cmd = EXT3_IOC_GROUP_EXTEND;
302 break;
303 case EXT3_IOC32_GETVERSION_OLD:
304 cmd = EXT3_IOC_GETVERSION_OLD;
305 break;
306 case EXT3_IOC32_SETVERSION_OLD:
307 cmd = EXT3_IOC_SETVERSION_OLD;
308 break;
309#ifdef CONFIG_JBD_DEBUG
310 case EXT3_IOC32_WAIT_FOR_READONLY:
311 cmd = EXT3_IOC_WAIT_FOR_READONLY;
312 break;
313#endif
314 case EXT3_IOC32_GETRSVSZ:
315 cmd = EXT3_IOC_GETRSVSZ;
316 break;
317 case EXT3_IOC32_SETRSVSZ:
318 cmd = EXT3_IOC_SETRSVSZ;
319 break;
320 case EXT3_IOC_GROUP_ADD:
321 break;
322 default:
323 return -ENOIOCTLCMD;
324 }
325 return ext3_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
326}
327#endif
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
deleted file mode 100644
index c9e767cd4b67..000000000000
--- a/fs/ext3/namei.c
+++ /dev/null
@@ -1,2586 +0,0 @@
1/*
2 * linux/fs/ext3/namei.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * from
10 *
11 * linux/fs/minix/namei.c
12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds
14 *
15 * Big-endian to little-endian byte-swapping/bitmaps by
16 * David S. Miller (davem@caip.rutgers.edu), 1995
17 * Directory entry file type support and forward compatibility hooks
18 * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998
19 * Hash Tree Directory indexing (c)
20 * Daniel Phillips, 2001
21 * Hash Tree Directory indexing porting
22 * Christopher Li, 2002
23 * Hash Tree Directory indexing cleanup
24 * Theodore Ts'o, 2002
25 */
26
27#include <linux/quotaops.h>
28#include "ext3.h"
29#include "namei.h"
30#include "xattr.h"
31#include "acl.h"
32
33/*
34 * define how far ahead to read directories while searching them.
35 */
36#define NAMEI_RA_CHUNKS 2
37#define NAMEI_RA_BLOCKS 4
38#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
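
/*
 * Editor's note, not part of the original source: with the values
 * above, NAMEI_RA_SIZE works out to 2 * 4 = 8, i.e. ext3_find_entry()
 * refills its readahead window with up to eight directory blocks at a
 * time.
 */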
39
40static struct buffer_head *ext3_append(handle_t *handle,
41 struct inode *inode,
42 u32 *block, int *err)
43{
44 struct buffer_head *bh;
45
46 *block = inode->i_size >> inode->i_sb->s_blocksize_bits;
47
48 if ((bh = ext3_dir_bread(handle, inode, *block, 1, err))) {
49 inode->i_size += inode->i_sb->s_blocksize;
50 EXT3_I(inode)->i_disksize = inode->i_size;
51 *err = ext3_journal_get_write_access(handle, bh);
52 if (*err) {
53 brelse(bh);
54 bh = NULL;
55 }
56 }
57 return bh;
58}
59
60#ifndef assert
61#define assert(test) J_ASSERT(test)
62#endif
63
64#ifdef DX_DEBUG
65#define dxtrace(command) command
66#else
67#define dxtrace(command)
68#endif
69
70struct fake_dirent
71{
72 __le32 inode;
73 __le16 rec_len;
74 u8 name_len;
75 u8 file_type;
76};
77
78struct dx_countlimit
79{
80 __le16 limit;
81 __le16 count;
82};
83
84struct dx_entry
85{
86 __le32 hash;
87 __le32 block;
88};
89
90/*
91 * dx_root_info is laid out so that if it should somehow get overlaid by a
92 * dirent the two low bits of the hash version will be zero. Therefore, the
93 * hash version mod 4 should never be 0. Sincerely, the paranoia department.
94 */
95
96struct dx_root
97{
98 struct fake_dirent dot;
99 char dot_name[4];
100 struct fake_dirent dotdot;
101 char dotdot_name[4];
102 struct dx_root_info
103 {
104 __le32 reserved_zero;
105 u8 hash_version;
106 u8 info_length; /* 8 */
107 u8 indirect_levels;
108 u8 unused_flags;
109 }
110 info;
111 struct dx_entry entries[0];
112};
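
/*
 * Editor's sketch, not part of the original source: the info_length
 * value documented as 8 above is simply sizeof(struct dx_root_info)
 * (one __le32 plus four u8 fields).  Assuming BUILD_BUG_ON is visible
 * here, a build-time check would look like this.
 */
#if 0
static inline void dx_root_info_layout_check(void)
{
	BUILD_BUG_ON(sizeof(struct dx_root_info) != 8);
}
#endif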
113
114struct dx_node
115{
116 struct fake_dirent fake;
117 struct dx_entry entries[0];
118};
119
120
121struct dx_frame
122{
123 struct buffer_head *bh;
124 struct dx_entry *entries;
125 struct dx_entry *at;
126};
127
128struct dx_map_entry
129{
130 u32 hash;
131 u16 offs;
132 u16 size;
133};
134
135static inline unsigned dx_get_block (struct dx_entry *entry);
136static void dx_set_block (struct dx_entry *entry, unsigned value);
137static inline unsigned dx_get_hash (struct dx_entry *entry);
138static void dx_set_hash (struct dx_entry *entry, unsigned value);
139static unsigned dx_get_count (struct dx_entry *entries);
140static unsigned dx_get_limit (struct dx_entry *entries);
141static void dx_set_count (struct dx_entry *entries, unsigned value);
142static void dx_set_limit (struct dx_entry *entries, unsigned value);
143static unsigned dx_root_limit (struct inode *dir, unsigned infosize);
144static unsigned dx_node_limit (struct inode *dir);
145static struct dx_frame *dx_probe(struct qstr *entry,
146 struct inode *dir,
147 struct dx_hash_info *hinfo,
148 struct dx_frame *frame,
149 int *err);
150static void dx_release (struct dx_frame *frames);
151static int dx_make_map(struct ext3_dir_entry_2 *de, unsigned blocksize,
152 struct dx_hash_info *hinfo, struct dx_map_entry map[]);
153static void dx_sort_map(struct dx_map_entry *map, unsigned count);
154static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to,
155 struct dx_map_entry *offsets, int count);
156static struct ext3_dir_entry_2 *dx_pack_dirents(char *base, unsigned blocksize);
157static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block);
158static int ext3_htree_next_block(struct inode *dir, __u32 hash,
159 struct dx_frame *frame,
160 struct dx_frame *frames,
161 __u32 *start_hash);
162static struct buffer_head * ext3_dx_find_entry(struct inode *dir,
163 struct qstr *entry, struct ext3_dir_entry_2 **res_dir,
164 int *err);
165static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
166 struct inode *inode);
167
168/*
169 * p is at least 6 bytes before the end of page
170 */
171static inline struct ext3_dir_entry_2 *
172ext3_next_entry(struct ext3_dir_entry_2 *p)
173{
174 return (struct ext3_dir_entry_2 *)((char *)p +
175 ext3_rec_len_from_disk(p->rec_len));
176}
177
178/*
179 * Future: use the high bits of the block number for coalesce-on-delete flags.
180 * Mask them off for now (only the low 24 bits are used as the block number).
181 */
182
183static inline unsigned dx_get_block (struct dx_entry *entry)
184{
185 return le32_to_cpu(entry->block) & 0x00ffffff;
186}
187
188static inline void dx_set_block (struct dx_entry *entry, unsigned value)
189{
190 entry->block = cpu_to_le32(value);
191}
192
193static inline unsigned dx_get_hash (struct dx_entry *entry)
194{
195 return le32_to_cpu(entry->hash);
196}
197
198static inline void dx_set_hash (struct dx_entry *entry, unsigned value)
199{
200 entry->hash = cpu_to_le32(value);
201}
202
203static inline unsigned dx_get_count (struct dx_entry *entries)
204{
205 return le16_to_cpu(((struct dx_countlimit *) entries)->count);
206}
207
208static inline unsigned dx_get_limit (struct dx_entry *entries)
209{
210 return le16_to_cpu(((struct dx_countlimit *) entries)->limit);
211}
212
213static inline void dx_set_count (struct dx_entry *entries, unsigned value)
214{
215 ((struct dx_countlimit *) entries)->count = cpu_to_le16(value);
216}
217
218static inline void dx_set_limit (struct dx_entry *entries, unsigned value)
219{
220 ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value);
221}
222
223static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize)
224{
225 unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(1) -
226 EXT3_DIR_REC_LEN(2) - infosize;
227 return entry_space / sizeof(struct dx_entry);
228}
229
230static inline unsigned dx_node_limit (struct inode *dir)
231{
232 unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(0);
233 return entry_space / sizeof(struct dx_entry);
234}
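
/*
 * Editor's note, illustrative arithmetic not in the original source:
 * with the usual EXT3_DIR_REC_LEN(n) = ((n) + 11) & ~3 and a 4096-byte
 * block, dx_root_limit() is (4096 - 12 - 12 - infosize) / 8, i.e. 508
 * index entries for the 8-byte info area, while dx_node_limit() is
 * (4096 - 8) / 8 = 511 entries per interior node.
 */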
235
236/*
237 * Debug
238 */
239#ifdef DX_DEBUG
240static void dx_show_index (char * label, struct dx_entry *entries)
241{
242 int i, n = dx_get_count (entries);
243 printk("%s index ", label);
244 for (i = 0; i < n; i++)
245 {
246 printk("%x->%u ", i? dx_get_hash(entries + i): 0, dx_get_block(entries + i));
247 }
248 printk("\n");
249}
250
251struct stats
252{
253 unsigned names;
254 unsigned space;
255 unsigned bcount;
256};
257
258static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext3_dir_entry_2 *de,
259 int size, int show_names)
260{
261 unsigned names = 0, space = 0;
262 char *base = (char *) de;
263 struct dx_hash_info h = *hinfo;
264
265 printk("names: ");
266 while ((char *) de < base + size)
267 {
268 if (de->inode)
269 {
270 if (show_names)
271 {
272 int len = de->name_len;
273 char *name = de->name;
274 while (len--) printk("%c", *name++);
275 ext3fs_dirhash(de->name, de->name_len, &h);
276 printk(":%x.%u ", h.hash,
277 (unsigned) ((char *) de - base));
278 }
279 space += EXT3_DIR_REC_LEN(de->name_len);
280 names++;
281 }
282 de = ext3_next_entry(de);
283 }
284 printk("(%i)\n", names);
285 return (struct stats) { names, space, 1 };
286}
287
288struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
289 struct dx_entry *entries, int levels)
290{
291 unsigned blocksize = dir->i_sb->s_blocksize;
292 unsigned count = dx_get_count (entries), names = 0, space = 0, i;
293 unsigned bcount = 0;
294 struct buffer_head *bh;
295 int err;
296 printk("%i indexed blocks...\n", count);
297 for (i = 0; i < count; i++, entries++)
298 {
299 u32 block = dx_get_block(entries), hash = i? dx_get_hash(entries): 0;
300 u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash;
301 struct stats stats;
302 printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range);
303 if (!(bh = ext3_bread (NULL,dir, block, 0,&err))) continue;
304 stats = levels?
305 dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1):
306 dx_show_leaf(hinfo, (struct ext3_dir_entry_2 *) bh->b_data, blocksize, 0);
307 names += stats.names;
308 space += stats.space;
309 bcount += stats.bcount;
310 brelse (bh);
311 }
312 if (bcount)
313 printk("%snames %u, fullness %u (%u%%)\n", levels?"":" ",
314 names, space/bcount,(space/bcount)*100/blocksize);
315 return (struct stats) { names, space, bcount};
316}
317#endif /* DX_DEBUG */
318
319/*
320 * Probe for a directory leaf block to search.
321 *
322 * dx_probe can return ERR_BAD_DX_DIR, which means there was a format
323 * error in the directory index, and the caller should fall back to
324 * searching the directory normally. The callers of dx_probe **MUST**
325 * check for this error code, and make sure it never gets reflected
326 * back to userspace.
327 */
328static struct dx_frame *
329dx_probe(struct qstr *entry, struct inode *dir,
330 struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
331{
332 unsigned count, indirect;
333 struct dx_entry *at, *entries, *p, *q, *m;
334 struct dx_root *root;
335 struct buffer_head *bh;
336 struct dx_frame *frame = frame_in;
337 u32 hash;
338
339 frame->bh = NULL;
340 if (!(bh = ext3_dir_bread(NULL, dir, 0, 0, err))) {
341 *err = ERR_BAD_DX_DIR;
342 goto fail;
343 }
344 root = (struct dx_root *) bh->b_data;
345 if (root->info.hash_version != DX_HASH_TEA &&
346 root->info.hash_version != DX_HASH_HALF_MD4 &&
347 root->info.hash_version != DX_HASH_LEGACY) {
348 ext3_warning(dir->i_sb, __func__,
349 "Unrecognised inode hash code %d",
350 root->info.hash_version);
351 brelse(bh);
352 *err = ERR_BAD_DX_DIR;
353 goto fail;
354 }
355 hinfo->hash_version = root->info.hash_version;
356 if (hinfo->hash_version <= DX_HASH_TEA)
357 hinfo->hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned;
358 hinfo->seed = EXT3_SB(dir->i_sb)->s_hash_seed;
359 if (entry)
360 ext3fs_dirhash(entry->name, entry->len, hinfo);
361 hash = hinfo->hash;
362
363 if (root->info.unused_flags & 1) {
364 ext3_warning(dir->i_sb, __func__,
365 "Unimplemented inode hash flags: %#06x",
366 root->info.unused_flags);
367 brelse(bh);
368 *err = ERR_BAD_DX_DIR;
369 goto fail;
370 }
371
372 if ((indirect = root->info.indirect_levels) > 1) {
373 ext3_warning(dir->i_sb, __func__,
374 "Unimplemented inode hash depth: %#06x",
375 root->info.indirect_levels);
376 brelse(bh);
377 *err = ERR_BAD_DX_DIR;
378 goto fail;
379 }
380
381 entries = (struct dx_entry *) (((char *)&root->info) +
382 root->info.info_length);
383
384 if (dx_get_limit(entries) != dx_root_limit(dir,
385 root->info.info_length)) {
386 ext3_warning(dir->i_sb, __func__,
387 "dx entry: limit != root limit");
388 brelse(bh);
389 *err = ERR_BAD_DX_DIR;
390 goto fail;
391 }
392
393 dxtrace (printk("Look up %x", hash));
394 while (1)
395 {
396 count = dx_get_count(entries);
397 if (!count || count > dx_get_limit(entries)) {
398 ext3_warning(dir->i_sb, __func__,
399 "dx entry: no count or count > limit");
400 brelse(bh);
401 *err = ERR_BAD_DX_DIR;
402 goto fail2;
403 }
404
405 p = entries + 1;
406 q = entries + count - 1;
407 while (p <= q)
408 {
409 m = p + (q - p)/2;
410 dxtrace(printk("."));
411 if (dx_get_hash(m) > hash)
412 q = m - 1;
413 else
414 p = m + 1;
415 }
416
417 if (0) // linear search cross check
418 {
419 unsigned n = count - 1;
420 at = entries;
421 while (n--)
422 {
423 dxtrace(printk(","));
424 if (dx_get_hash(++at) > hash)
425 {
426 at--;
427 break;
428 }
429 }
430 assert (at == p - 1);
431 }
432
433 at = p - 1;
434 dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at)));
435 frame->bh = bh;
436 frame->entries = entries;
437 frame->at = at;
438 if (!indirect--) return frame;
439 if (!(bh = ext3_dir_bread(NULL, dir, dx_get_block(at), 0, err))) {
440 *err = ERR_BAD_DX_DIR;
441 goto fail2;
442 }
443 at = entries = ((struct dx_node *) bh->b_data)->entries;
444 if (dx_get_limit(entries) != dx_node_limit (dir)) {
445 ext3_warning(dir->i_sb, __func__,
446 "dx entry: limit != node limit");
447 brelse(bh);
448 *err = ERR_BAD_DX_DIR;
449 goto fail2;
450 }
451 frame++;
452 frame->bh = NULL;
453 }
454fail2:
455 while (frame >= frame_in) {
456 brelse(frame->bh);
457 frame--;
458 }
459fail:
460 if (*err == ERR_BAD_DX_DIR)
461 ext3_warning(dir->i_sb, __func__,
462 "Corrupt dir inode %ld, running e2fsck is "
463 "recommended.", dir->i_ino);
464 return NULL;
465}
466
467static void dx_release (struct dx_frame *frames)
468{
469 if (frames[0].bh == NULL)
470 return;
471
472 if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels)
473 brelse(frames[1].bh);
474 brelse(frames[0].bh);
475}
476
477/*
478 * This function increments the frame pointer to search the next leaf
479 * block, and reads in the intervening index nodes when the search
480 * should continue. Whether or not the search continues is
481 * controlled by the hash parameter. If the hash value is even, then
482 * the search is only continued if the next block starts with that
483 * hash value. This is used if we are searching for a specific file.
484 *
485 * If the hash value is HASH_NB_ALWAYS, then always go to the next block.
486 *
487 * This function returns 1 if the caller should continue to search,
488 * or 0 if it should not. If there is an error reading one of the
489 * index blocks, it will return a negative error code.
490 *
491 * If start_hash is non-null, it will be filled in with the starting
492 * hash of the next page.
493 */
494static int ext3_htree_next_block(struct inode *dir, __u32 hash,
495 struct dx_frame *frame,
496 struct dx_frame *frames,
497 __u32 *start_hash)
498{
499 struct dx_frame *p;
500 struct buffer_head *bh;
501 int err, num_frames = 0;
502 __u32 bhash;
503
504 p = frame;
505 /*
506 * Find the next leaf page by incrementing the frame pointer.
507 * If we run out of entries in the interior node, loop around and
508 * increment pointer in the parent node. When we break out of
509 * this loop, num_frames indicates the number of interior
510 * nodes that need to be read.
511 */
512 while (1) {
513 if (++(p->at) < p->entries + dx_get_count(p->entries))
514 break;
515 if (p == frames)
516 return 0;
517 num_frames++;
518 p--;
519 }
520
521 /*
522 * If the hash is 1, then continue only if the next page has a
523 * continuation hash of any value. This is used for readdir
524 * handling. Otherwise, check to see if the hash matches the
525 * desired continuation hash. If it doesn't, return, since
526 * there's no point in reading the successive index pages.
527 */
528 bhash = dx_get_hash(p->at);
529 if (start_hash)
530 *start_hash = bhash;
531 if ((hash & 1) == 0) {
532 if ((bhash & ~1) != hash)
533 return 0;
534 }
535 /*
536 * If the hash is HASH_NB_ALWAYS, we always go to the next
537 * block so no check is necessary
538 */
539 while (num_frames--) {
540 if (!(bh = ext3_dir_bread(NULL, dir, dx_get_block(p->at),
541 0, &err)))
542 return err; /* Failure */
543 p++;
544 brelse (p->bh);
545 p->bh = bh;
546 p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
547 }
548 return 1;
549}
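
/*
 * Editor's note, worked example not in the original source: lookup
 * hashes have their low bit clear, so with a target hash of, say,
 * 0x2468adf0 the walk above only continues into the next leaf if that
 * leaf's starting hash is 0x2468adf0 or 0x2468adf1 (the continuation
 * variant with the low bit set).  Readdir passes HASH_NB_ALWAYS and
 * takes every leaf unconditionally.
 */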
550
551
552/*
553 * This function fills a red-black tree with information from a
554 * directory block. It returns the number of directory entries loaded
555 * into the tree. If there is an error, it is returned in err.
556 */
557static int htree_dirblock_to_tree(struct file *dir_file,
558 struct inode *dir, int block,
559 struct dx_hash_info *hinfo,
560 __u32 start_hash, __u32 start_minor_hash)
561{
562 struct buffer_head *bh;
563 struct ext3_dir_entry_2 *de, *top;
564 int err = 0, count = 0;
565
566 dxtrace(printk("In htree dirblock_to_tree: block %d\n", block));
567
568 if (!(bh = ext3_dir_bread(NULL, dir, block, 0, &err)))
569 return err;
570
571 de = (struct ext3_dir_entry_2 *) bh->b_data;
572 top = (struct ext3_dir_entry_2 *) ((char *) de +
573 dir->i_sb->s_blocksize -
574 EXT3_DIR_REC_LEN(0));
575 for (; de < top; de = ext3_next_entry(de)) {
576 if (!ext3_check_dir_entry("htree_dirblock_to_tree", dir, de, bh,
577 (block<<EXT3_BLOCK_SIZE_BITS(dir->i_sb))
578 +((char *)de - bh->b_data))) {
579 /* silently ignore the rest of the block */
580 break;
581 }
582 ext3fs_dirhash(de->name, de->name_len, hinfo);
583 if ((hinfo->hash < start_hash) ||
584 ((hinfo->hash == start_hash) &&
585 (hinfo->minor_hash < start_minor_hash)))
586 continue;
587 if (de->inode == 0)
588 continue;
589 if ((err = ext3_htree_store_dirent(dir_file,
590 hinfo->hash, hinfo->minor_hash, de)) != 0) {
591 brelse(bh);
592 return err;
593 }
594 count++;
595 }
596 brelse(bh);
597 return count;
598}
599
600
601/*
602 * This function fills a red-black tree with information from a
603 * directory. We start scanning the directory in hash order, starting
604 * at start_hash and start_minor_hash.
605 *
606 * This function returns the number of entries inserted into the tree,
607 * or a negative error code.
608 */
609int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
610 __u32 start_minor_hash, __u32 *next_hash)
611{
612 struct dx_hash_info hinfo;
613 struct ext3_dir_entry_2 *de;
614 struct dx_frame frames[2], *frame;
615 struct inode *dir;
616 int block, err;
617 int count = 0;
618 int ret;
619 __u32 hashval;
620
621 dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash,
622 start_minor_hash));
623 dir = file_inode(dir_file);
624 if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) {
625 hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version;
626 if (hinfo.hash_version <= DX_HASH_TEA)
627 hinfo.hash_version +=
628 EXT3_SB(dir->i_sb)->s_hash_unsigned;
629 hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
630 count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
631 start_hash, start_minor_hash);
632 *next_hash = ~0;
633 return count;
634 }
635 hinfo.hash = start_hash;
636 hinfo.minor_hash = 0;
637 frame = dx_probe(NULL, file_inode(dir_file), &hinfo, frames, &err);
638 if (!frame)
639 return err;
640
641 /* Add '.' and '..' from the htree header */
642 if (!start_hash && !start_minor_hash) {
643 de = (struct ext3_dir_entry_2 *) frames[0].bh->b_data;
644 if ((err = ext3_htree_store_dirent(dir_file, 0, 0, de)) != 0)
645 goto errout;
646 count++;
647 }
648 if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) {
649 de = (struct ext3_dir_entry_2 *) frames[0].bh->b_data;
650 de = ext3_next_entry(de);
651 if ((err = ext3_htree_store_dirent(dir_file, 2, 0, de)) != 0)
652 goto errout;
653 count++;
654 }
655
656 while (1) {
657 block = dx_get_block(frame->at);
658 ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo,
659 start_hash, start_minor_hash);
660 if (ret < 0) {
661 err = ret;
662 goto errout;
663 }
664 count += ret;
665 hashval = ~0;
666 ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS,
667 frame, frames, &hashval);
668 *next_hash = hashval;
669 if (ret < 0) {
670 err = ret;
671 goto errout;
672 }
673 /*
674 * Stop if: (a) there are no more entries, or
675 * (b) we have inserted at least one entry and the
676 * next hash value is not a continuation
677 */
678 if ((ret == 0) ||
679 (count && ((hashval & 1) == 0)))
680 break;
681 }
682 dx_release(frames);
683 dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n",
684 count, *next_hash));
685 return count;
686errout:
687 dx_release(frames);
688 return (err);
689}
690
691
692/*
693 * Directory block splitting, compacting
694 */
695
696/*
697 * Create map of hash values, offsets, and sizes, stored at end of block.
698 * Returns number of entries mapped.
699 */
700static int dx_make_map(struct ext3_dir_entry_2 *de, unsigned blocksize,
701 struct dx_hash_info *hinfo, struct dx_map_entry *map_tail)
702{
703 int count = 0;
704 char *base = (char *) de;
705 struct dx_hash_info h = *hinfo;
706
707 while ((char *) de < base + blocksize)
708 {
709 if (de->name_len && de->inode) {
710 ext3fs_dirhash(de->name, de->name_len, &h);
711 map_tail--;
712 map_tail->hash = h.hash;
713 map_tail->offs = (u16) ((char *) de - base);
714 map_tail->size = le16_to_cpu(de->rec_len);
715 count++;
716 cond_resched();
717 }
718 /* XXX: do we need to check rec_len == 0 case? -Chris */
719 de = ext3_next_entry(de);
720 }
721 return count;
722}
723
724/* Sort map by hash value */
725static void dx_sort_map (struct dx_map_entry *map, unsigned count)
726{
727 struct dx_map_entry *p, *q, *top = map + count - 1;
728 int more;
729 /* Combsort until bubble sort doesn't suck */
730 while (count > 2)
731 {
732 count = count*10/13;
733 if (count - 9 < 2) /* 9, 10 -> 11 */
734 count = 11;
735 for (p = top, q = p - count; q >= map; p--, q--)
736 if (p->hash < q->hash)
737 swap(*p, *q);
738 }
739 /* Garden variety bubble sort */
740 do {
741 more = 0;
742 q = top;
743 while (q-- > map)
744 {
745 if (q[1].hash >= q[0].hash)
746 continue;
747 swap(*(q+1), *q);
748 more = 1;
749 }
750 } while(more);
751}
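
/*
 * Editor's note, not in the original source: the comb-sort pass above
 * shrinks the gap by 10/13 each round and bumps gaps of 9 or 10 up to
 * 11; starting from count = 100 the gap sequence is 76, 58, 44, 33,
 * 25, 19, 14, 11, 8, 6, 4, 3, 2, after which the plain bubble sort
 * finishes the job.  Because count is unsigned, the "count - 9 < 2"
 * test wraps around for gaps below 9 and leaves them alone.
 */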
752
753static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block)
754{
755 struct dx_entry *entries = frame->entries;
756 struct dx_entry *old = frame->at, *new = old + 1;
757 int count = dx_get_count(entries);
758
759 assert(count < dx_get_limit(entries));
760 assert(old < entries + count);
761 memmove(new + 1, new, (char *)(entries + count) - (char *)(new));
762 dx_set_hash(new, hash);
763 dx_set_block(new, block);
764 dx_set_count(entries, count + 1);
765}
766
767static void ext3_update_dx_flag(struct inode *inode)
768{
769 if (!EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
770 EXT3_FEATURE_COMPAT_DIR_INDEX))
771 EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL;
772}
773
774/*
775 * NOTE! unlike strncmp, ext3_match returns 1 for success, 0 for failure.
776 *
777 * `len <= EXT3_NAME_LEN' is guaranteed by caller.
778 * `de != NULL' is guaranteed by caller.
779 */
780static inline int ext3_match (int len, const char * const name,
781 struct ext3_dir_entry_2 * de)
782{
783 if (len != de->name_len)
784 return 0;
785 if (!de->inode)
786 return 0;
787 return !memcmp(name, de->name, len);
788}
789
790/*
791 * Returns 0 if not found, -1 on failure, and 1 on success
792 */
793static inline int search_dirblock(struct buffer_head * bh,
794 struct inode *dir,
795 struct qstr *child,
796 unsigned long offset,
797 struct ext3_dir_entry_2 ** res_dir)
798{
799 struct ext3_dir_entry_2 * de;
800 char * dlimit;
801 int de_len;
802 const char *name = child->name;
803 int namelen = child->len;
804
805 de = (struct ext3_dir_entry_2 *) bh->b_data;
806 dlimit = bh->b_data + dir->i_sb->s_blocksize;
807 while ((char *) de < dlimit) {
808 /* this code is executed quadratically often */
809 /* do minimal checking `by hand' */
810
811 if ((char *) de + namelen <= dlimit &&
812 ext3_match (namelen, name, de)) {
813 /* found a match - just to be sure, do a full check */
814 if (!ext3_check_dir_entry("ext3_find_entry",
815 dir, de, bh, offset))
816 return -1;
817 *res_dir = de;
818 return 1;
819 }
820 /* prevent looping on a bad block */
821 de_len = ext3_rec_len_from_disk(de->rec_len);
822 if (de_len <= 0)
823 return -1;
824 offset += de_len;
825 de = (struct ext3_dir_entry_2 *) ((char *) de + de_len);
826 }
827 return 0;
828}
829
830
831/*
832 * ext3_find_entry()
833 *
834 * finds an entry in the specified directory with the wanted name. It
835 * returns the cache buffer in which the entry was found, and the entry
836 * itself (as a parameter - res_dir). It does NOT read the inode of the
837 * entry - you'll have to do that yourself if you want to.
838 *
839 * The returned buffer_head has ->b_count elevated. The caller is expected
840 * to brelse() it when appropriate.
841 */
842static struct buffer_head *ext3_find_entry(struct inode *dir,
843 struct qstr *entry,
844 struct ext3_dir_entry_2 **res_dir)
845{
846 struct super_block * sb;
847 struct buffer_head * bh_use[NAMEI_RA_SIZE];
848 struct buffer_head * bh, *ret = NULL;
849 unsigned long start, block, b;
850 const u8 *name = entry->name;
851 int ra_max = 0; /* Number of bh's in the readahead
852 buffer, bh_use[] */
853 int ra_ptr = 0; /* Current index into readahead
854 buffer */
855 int num = 0;
856 int nblocks, i, err;
857 int namelen;
858
859 *res_dir = NULL;
860 sb = dir->i_sb;
861 namelen = entry->len;
862 if (namelen > EXT3_NAME_LEN)
863 return NULL;
864 if ((namelen <= 2) && (name[0] == '.') &&
865 (name[1] == '.' || name[1] == 0)) {
866 /*
867 * "." or ".." will only be in the first block
868 * NFS may look up ".."; "." should be handled by the VFS
869 */
870 block = start = 0;
871 nblocks = 1;
872 goto restart;
873 }
874 if (is_dx(dir)) {
875 bh = ext3_dx_find_entry(dir, entry, res_dir, &err);
876 /*
877 * On success, or if the error was file not found,
878 * return. Otherwise, fall back to doing a search the
879 * old fashioned way.
880 */
881 if (bh || (err != ERR_BAD_DX_DIR))
882 return bh;
883 dxtrace(printk("ext3_find_entry: dx failed, falling back\n"));
884 }
885 nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb);
886 start = EXT3_I(dir)->i_dir_start_lookup;
887 if (start >= nblocks)
888 start = 0;
889 block = start;
890restart:
891 do {
892 /*
893 * We deal with the read-ahead logic here.
894 */
895 if (ra_ptr >= ra_max) {
896 /* Refill the readahead buffer */
897 ra_ptr = 0;
898 b = block;
899 for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) {
900 /*
901 * Terminate if we reach the end of the
902 * directory and must wrap, or if our
903 * search has finished at this block.
904 */
905 if (b >= nblocks || (num && block == start)) {
906 bh_use[ra_max] = NULL;
907 break;
908 }
909 num++;
910 bh = ext3_getblk(NULL, dir, b++, 0, &err);
911 bh_use[ra_max] = bh;
912 if (bh && !bh_uptodate_or_lock(bh)) {
913 get_bh(bh);
914 bh->b_end_io = end_buffer_read_sync;
915 submit_bh(READ | REQ_META | REQ_PRIO,
916 bh);
917 }
918 }
919 }
920 if ((bh = bh_use[ra_ptr++]) == NULL)
921 goto next;
922 wait_on_buffer(bh);
923 if (!buffer_uptodate(bh)) {
924 /* read error, skip block & hope for the best */
925 ext3_error(sb, __func__, "reading directory #%lu "
926 "offset %lu", dir->i_ino, block);
927 brelse(bh);
928 goto next;
929 }
930 i = search_dirblock(bh, dir, entry,
931 block << EXT3_BLOCK_SIZE_BITS(sb), res_dir);
932 if (i == 1) {
933 EXT3_I(dir)->i_dir_start_lookup = block;
934 ret = bh;
935 goto cleanup_and_exit;
936 } else {
937 brelse(bh);
938 if (i < 0)
939 goto cleanup_and_exit;
940 }
941 next:
942 if (++block >= nblocks)
943 block = 0;
944 } while (block != start);
945
946 /*
947 * If the directory has grown while we were searching, then
948 * search the last part of the directory before giving up.
949 */
950 block = nblocks;
951 nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb);
952 if (block < nblocks) {
953 start = 0;
954 goto restart;
955 }
956
957cleanup_and_exit:
958 /* Clean up the read-ahead blocks */
959 for (; ra_ptr < ra_max; ra_ptr++)
960 brelse (bh_use[ra_ptr]);
961 return ret;
962}
963
964static struct buffer_head * ext3_dx_find_entry(struct inode *dir,
965 struct qstr *entry, struct ext3_dir_entry_2 **res_dir,
966 int *err)
967{
968 struct super_block *sb = dir->i_sb;
969 struct dx_hash_info hinfo;
970 struct dx_frame frames[2], *frame;
971 struct buffer_head *bh;
972 unsigned long block;
973 int retval;
974
975 if (!(frame = dx_probe(entry, dir, &hinfo, frames, err)))
976 return NULL;
977 do {
978 block = dx_get_block(frame->at);
979 if (!(bh = ext3_dir_bread (NULL, dir, block, 0, err)))
980 goto errout;
981
982 retval = search_dirblock(bh, dir, entry,
983 block << EXT3_BLOCK_SIZE_BITS(sb),
984 res_dir);
985 if (retval == 1) {
986 dx_release(frames);
987 return bh;
988 }
989 brelse(bh);
990 if (retval == -1) {
991 *err = ERR_BAD_DX_DIR;
992 goto errout;
993 }
994
995 /* Check to see if we should continue to search */
996 retval = ext3_htree_next_block(dir, hinfo.hash, frame,
997 frames, NULL);
998 if (retval < 0) {
999 ext3_warning(sb, __func__,
1000 "error reading index page in directory #%lu",
1001 dir->i_ino);
1002 *err = retval;
1003 goto errout;
1004 }
1005 } while (retval == 1);
1006
1007 *err = -ENOENT;
1008errout:
1009 dxtrace(printk("%s not found\n", entry->name));
1010 dx_release (frames);
1011 return NULL;
1012}
1013
1014static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, unsigned int flags)
1015{
1016 struct inode * inode;
1017 struct ext3_dir_entry_2 * de;
1018 struct buffer_head * bh;
1019
1020 if (dentry->d_name.len > EXT3_NAME_LEN)
1021 return ERR_PTR(-ENAMETOOLONG);
1022
1023 bh = ext3_find_entry(dir, &dentry->d_name, &de);
1024 inode = NULL;
1025 if (bh) {
1026 unsigned long ino = le32_to_cpu(de->inode);
1027 brelse (bh);
1028 if (!ext3_valid_inum(dir->i_sb, ino)) {
1029 ext3_error(dir->i_sb, "ext3_lookup",
1030 "bad inode number: %lu", ino);
1031 return ERR_PTR(-EIO);
1032 }
1033 inode = ext3_iget(dir->i_sb, ino);
1034 if (inode == ERR_PTR(-ESTALE)) {
1035 ext3_error(dir->i_sb, __func__,
1036 "deleted inode referenced: %lu",
1037 ino);
1038 return ERR_PTR(-EIO);
1039 }
1040 }
1041 return d_splice_alias(inode, dentry);
1042}
1043
1044
1045struct dentry *ext3_get_parent(struct dentry *child)
1046{
1047 unsigned long ino;
1048 struct qstr dotdot = QSTR_INIT("..", 2);
1049 struct ext3_dir_entry_2 * de;
1050 struct buffer_head *bh;
1051
1052 bh = ext3_find_entry(d_inode(child), &dotdot, &de);
1053 if (!bh)
1054 return ERR_PTR(-ENOENT);
1055 ino = le32_to_cpu(de->inode);
1056 brelse(bh);
1057
1058 if (!ext3_valid_inum(d_inode(child)->i_sb, ino)) {
1059 ext3_error(d_inode(child)->i_sb, "ext3_get_parent",
1060 "bad inode number: %lu", ino);
1061 return ERR_PTR(-EIO);
1062 }
1063
1064 return d_obtain_alias(ext3_iget(d_inode(child)->i_sb, ino));
1065}
1066
1067#define S_SHIFT 12
1068static unsigned char ext3_type_by_mode[S_IFMT >> S_SHIFT] = {
1069 [S_IFREG >> S_SHIFT] = EXT3_FT_REG_FILE,
1070 [S_IFDIR >> S_SHIFT] = EXT3_FT_DIR,
1071 [S_IFCHR >> S_SHIFT] = EXT3_FT_CHRDEV,
1072 [S_IFBLK >> S_SHIFT] = EXT3_FT_BLKDEV,
1073 [S_IFIFO >> S_SHIFT] = EXT3_FT_FIFO,
1074 [S_IFSOCK >> S_SHIFT] = EXT3_FT_SOCK,
1075 [S_IFLNK >> S_SHIFT] = EXT3_FT_SYMLINK,
1076};
1077
1078static inline void ext3_set_de_type(struct super_block *sb,
1079 struct ext3_dir_entry_2 *de,
1080 umode_t mode) {
1081 if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE))
1082 de->file_type = ext3_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
1083}
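
/*
 * Editor's note, worked example not in the original source: S_SHIFT
 * drops the low twelve bits of the mode so that the S_IFMT nibble
 * indexes the table above; a directory (S_IFDIR = 0040000) gives
 * index 4 and thus EXT3_FT_DIR, a regular file (S_IFREG = 0100000)
 * gives index 8 and EXT3_FT_REG_FILE.
 */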
1084
1085/*
1086 * Move count entries from end of map between two memory locations.
1087 * Returns pointer to last entry moved.
1088 */
1089static struct ext3_dir_entry_2 *
1090dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
1091{
1092 unsigned rec_len = 0;
1093
1094 while (count--) {
1095 struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs);
1096 rec_len = EXT3_DIR_REC_LEN(de->name_len);
1097 memcpy (to, de, rec_len);
1098 ((struct ext3_dir_entry_2 *) to)->rec_len =
1099 ext3_rec_len_to_disk(rec_len);
1100 de->inode = 0;
1101 map++;
1102 to += rec_len;
1103 }
1104 return (struct ext3_dir_entry_2 *) (to - rec_len);
1105}
1106
1107/*
1108 * Compact each dir entry in the range to the minimal rec_len.
1109 * Returns pointer to last entry in range.
1110 */
1111static struct ext3_dir_entry_2 *dx_pack_dirents(char *base, unsigned blocksize)
1112{
1113 struct ext3_dir_entry_2 *next, *to, *prev;
1114 struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *)base;
1115 unsigned rec_len = 0;
1116
1117 prev = to = de;
1118 while ((char *)de < base + blocksize) {
1119 next = ext3_next_entry(de);
1120 if (de->inode && de->name_len) {
1121 rec_len = EXT3_DIR_REC_LEN(de->name_len);
1122 if (de > to)
1123 memmove(to, de, rec_len);
1124 to->rec_len = ext3_rec_len_to_disk(rec_len);
1125 prev = to;
1126 to = (struct ext3_dir_entry_2 *) (((char *) to) + rec_len);
1127 }
1128 de = next;
1129 }
1130 return prev;
1131}
1132
1133/*
1134 * Split a full leaf block to make room for a new dir entry.
1135 * Allocate a new block, and move entries so that they are approx. equally full.
1136 * Returns pointer to de in block into which the new entry will be inserted.
1137 */
1138static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1139 struct buffer_head **bh,struct dx_frame *frame,
1140 struct dx_hash_info *hinfo, int *error)
1141{
1142 unsigned blocksize = dir->i_sb->s_blocksize;
1143 unsigned count, continued;
1144 struct buffer_head *bh2;
1145 u32 newblock;
1146 u32 hash2;
1147 struct dx_map_entry *map;
1148 char *data1 = (*bh)->b_data, *data2;
1149 unsigned split, move, size;
1150 struct ext3_dir_entry_2 *de = NULL, *de2;
1151 int err = 0, i;
1152
1153 bh2 = ext3_append (handle, dir, &newblock, &err);
1154 if (!(bh2)) {
1155 brelse(*bh);
1156 *bh = NULL;
1157 goto errout;
1158 }
1159
1160 BUFFER_TRACE(*bh, "get_write_access");
1161 err = ext3_journal_get_write_access(handle, *bh);
1162 if (err)
1163 goto journal_error;
1164
1165 BUFFER_TRACE(frame->bh, "get_write_access");
1166 err = ext3_journal_get_write_access(handle, frame->bh);
1167 if (err)
1168 goto journal_error;
1169
1170 data2 = bh2->b_data;
1171
1172 /* create map in the end of data2 block */
1173 map = (struct dx_map_entry *) (data2 + blocksize);
1174 count = dx_make_map ((struct ext3_dir_entry_2 *) data1,
1175 blocksize, hinfo, map);
1176 map -= count;
1177 dx_sort_map (map, count);
1178 /* Split the existing block in the middle, size-wise */
1179 size = 0;
1180 move = 0;
1181 for (i = count-1; i >= 0; i--) {
1182 /* is more than half of this entry in 2nd half of the block? */
1183 if (size + map[i].size/2 > blocksize/2)
1184 break;
1185 size += map[i].size;
1186 move++;
1187 }
1188 /* map index at which we will split */
1189 split = count - move;
1190 hash2 = map[split].hash;
1191 continued = hash2 == map[split - 1].hash;
1192 dxtrace(printk("Split block %i at %x, %i/%i\n",
1193 dx_get_block(frame->at), hash2, split, count-split));
1194
1195 /* Fancy dance to stay within two buffers */
1196 de2 = dx_move_dirents(data1, data2, map + split, count - split);
1197 de = dx_pack_dirents(data1,blocksize);
1198 de->rec_len = ext3_rec_len_to_disk(data1 + blocksize - (char *) de);
1199 de2->rec_len = ext3_rec_len_to_disk(data2 + blocksize - (char *) de2);
1200 dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1));
1201 dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1));
1202
1203 /* Which block gets the new entry? */
1204 if (hinfo->hash >= hash2)
1205 {
1206 swap(*bh, bh2);
1207 de = de2;
1208 }
1209 dx_insert_block (frame, hash2 + continued, newblock);
1210 err = ext3_journal_dirty_metadata (handle, bh2);
1211 if (err)
1212 goto journal_error;
1213 err = ext3_journal_dirty_metadata (handle, frame->bh);
1214 if (err)
1215 goto journal_error;
1216 brelse (bh2);
1217 dxtrace(dx_show_index ("frame", frame->entries));
1218 return de;
1219
1220journal_error:
1221 brelse(*bh);
1222 brelse(bh2);
1223 *bh = NULL;
1224 ext3_std_error(dir->i_sb, err);
1225errout:
1226 *error = err;
1227 return NULL;
1228}
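
/*
 * Editor's note, not in the original source: the split loop above walks
 * the hash-sorted map from the highest hash downwards and keeps
 * assigning entries to the new block until roughly half of the block's
 * bytes would be moved.  Everything from map[split] upwards lands in
 * the new block, and hash2 = map[split].hash (plus the continuation bit
 * when the same hash straddles the boundary) becomes the key of the
 * new index entry.
 */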
1229
1230
1231/*
1232 * Add a new entry into a directory (leaf) block. If de is non-NULL,
1233 * it points to a directory entry which is guaranteed to be large
1234 * enough for the new directory entry. If de is NULL, then
1235 * add_dirent_to_buf will attempt to search the directory block for
1236 * space. It will return -ENOSPC if no space is available, -EIO if the
1237 * block is corrupted, and -EEXIST if the directory entry already exists.
1238 *
1239 * NOTE! bh is NOT released in the case where ENOSPC is returned. In
1240 * all other cases bh is released.
1241 */
1242static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1243 struct inode *inode, struct ext3_dir_entry_2 *de,
1244 struct buffer_head * bh)
1245{
1246 struct inode *dir = d_inode(dentry->d_parent);
1247 const char *name = dentry->d_name.name;
1248 int namelen = dentry->d_name.len;
1249 unsigned long offset = 0;
1250 unsigned short reclen;
1251 int nlen, rlen, err;
1252 char *top;
1253
1254 reclen = EXT3_DIR_REC_LEN(namelen);
1255 if (!de) {
1256 de = (struct ext3_dir_entry_2 *)bh->b_data;
1257 top = bh->b_data + dir->i_sb->s_blocksize - reclen;
1258 while ((char *) de <= top) {
1259 if (!ext3_check_dir_entry("ext3_add_entry", dir, de,
1260 bh, offset)) {
1261 brelse (bh);
1262 return -EIO;
1263 }
1264 if (ext3_match (namelen, name, de)) {
1265 brelse (bh);
1266 return -EEXIST;
1267 }
1268 nlen = EXT3_DIR_REC_LEN(de->name_len);
1269 rlen = ext3_rec_len_from_disk(de->rec_len);
1270 if ((de->inode? rlen - nlen: rlen) >= reclen)
1271 break;
1272 de = (struct ext3_dir_entry_2 *)((char *)de + rlen);
1273 offset += rlen;
1274 }
1275 if ((char *) de > top)
1276 return -ENOSPC;
1277 }
1278 BUFFER_TRACE(bh, "get_write_access");
1279 err = ext3_journal_get_write_access(handle, bh);
1280 if (err) {
1281 ext3_std_error(dir->i_sb, err);
1282 brelse(bh);
1283 return err;
1284 }
1285
1286 /* By now the buffer is marked for journaling */
1287 nlen = EXT3_DIR_REC_LEN(de->name_len);
1288 rlen = ext3_rec_len_from_disk(de->rec_len);
1289 if (de->inode) {
1290 struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen);
1291 de1->rec_len = ext3_rec_len_to_disk(rlen - nlen);
1292 de->rec_len = ext3_rec_len_to_disk(nlen);
1293 de = de1;
1294 }
1295 de->file_type = EXT3_FT_UNKNOWN;
1296 if (inode) {
1297 de->inode = cpu_to_le32(inode->i_ino);
1298 ext3_set_de_type(dir->i_sb, de, inode->i_mode);
1299 } else
1300 de->inode = 0;
1301 de->name_len = namelen;
1302 memcpy (de->name, name, namelen);
1303 /*
1304 * XXX shouldn't update any times until successful
1305 * completion of syscall, but too many callers depend
1306 * on this.
1307 *
1308 * XXX similarly, too many callers depend on
1309 * ext3_new_inode() setting the times, but error
1310 * recovery deletes the inode, so the worst that can
1311 * happen is that the times are slightly out of date
1312 * and/or different from the directory change time.
1313 */
1314 dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
1315 ext3_update_dx_flag(dir);
1316 dir->i_version++;
1317 ext3_mark_inode_dirty(handle, dir);
1318 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
1319 err = ext3_journal_dirty_metadata(handle, bh);
1320 if (err)
1321 ext3_std_error(dir->i_sb, err);
1322 brelse(bh);
1323 return 0;
1324}
1325
1326/*
1327 * This converts a one block unindexed directory to a 3 block indexed
1328 * directory, and adds the dentry to the indexed directory.
1329 */
1330static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1331 struct inode *inode, struct buffer_head *bh)
1332{
1333 struct inode *dir = d_inode(dentry->d_parent);
1334 const char *name = dentry->d_name.name;
1335 int namelen = dentry->d_name.len;
1336 struct buffer_head *bh2;
1337 struct dx_root *root;
1338 struct dx_frame frames[2], *frame;
1339 struct dx_entry *entries;
1340 struct ext3_dir_entry_2 *de, *de2;
1341 char *data1, *top;
1342 unsigned len;
1343 int retval;
1344 unsigned blocksize;
1345 struct dx_hash_info hinfo;
1346 u32 block;
1347 struct fake_dirent *fde;
1348
1349 blocksize = dir->i_sb->s_blocksize;
1350 dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino));
1351 retval = ext3_journal_get_write_access(handle, bh);
1352 if (retval) {
1353 ext3_std_error(dir->i_sb, retval);
1354 brelse(bh);
1355 return retval;
1356 }
1357 root = (struct dx_root *) bh->b_data;
1358
1359 /* The 0th block becomes the root, move the dirents out */
1360 fde = &root->dotdot;
1361 de = (struct ext3_dir_entry_2 *)((char *)fde +
1362 ext3_rec_len_from_disk(fde->rec_len));
1363 if ((char *) de >= (((char *) root) + blocksize)) {
1364 ext3_error(dir->i_sb, __func__,
1365 "invalid rec_len for '..' in inode %lu",
1366 dir->i_ino);
1367 brelse(bh);
1368 return -EIO;
1369 }
1370 len = ((char *) root) + blocksize - (char *) de;
1371
1372 bh2 = ext3_append (handle, dir, &block, &retval);
1373 if (!(bh2)) {
1374 brelse(bh);
1375 return retval;
1376 }
1377 EXT3_I(dir)->i_flags |= EXT3_INDEX_FL;
1378 data1 = bh2->b_data;
1379
1380 memcpy (data1, de, len);
1381 de = (struct ext3_dir_entry_2 *) data1;
1382 top = data1 + len;
1383 while ((char *)(de2 = ext3_next_entry(de)) < top)
1384 de = de2;
1385 de->rec_len = ext3_rec_len_to_disk(data1 + blocksize - (char *) de);
1386 /* Initialize the root; the dot dirents already exist */
1387 de = (struct ext3_dir_entry_2 *) (&root->dotdot);
1388 de->rec_len = ext3_rec_len_to_disk(blocksize - EXT3_DIR_REC_LEN(2));
1389 memset (&root->info, 0, sizeof(root->info));
1390 root->info.info_length = sizeof(root->info);
1391 root->info.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version;
1392 entries = root->entries;
1393 dx_set_block (entries, 1);
1394 dx_set_count (entries, 1);
1395 dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info)));
1396
1397 /* Initialize as for dx_probe */
1398 hinfo.hash_version = root->info.hash_version;
1399 if (hinfo.hash_version <= DX_HASH_TEA)
1400 hinfo.hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned;
1401 hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
1402 ext3fs_dirhash(name, namelen, &hinfo);
1403 frame = frames;
1404 frame->entries = entries;
1405 frame->at = entries;
1406 frame->bh = bh;
1407 bh = bh2;
1408 /*
1409 * Mark buffers dirty here so that if do_split() fails we write a
1410 * consistent set of buffers to disk.
1411 */
1412 ext3_journal_dirty_metadata(handle, frame->bh);
1413 ext3_journal_dirty_metadata(handle, bh);
1414 de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
1415 if (!de) {
1416 ext3_mark_inode_dirty(handle, dir);
1417 dx_release(frames);
1418 return retval;
1419 }
1420 dx_release(frames);
1421
1422 return add_dirent_to_buf(handle, dentry, inode, de, bh);
1423}
1424
1425/*
1426 * ext3_add_entry()
1427 *
1428 * adds a file entry to the specified directory, using the same
1429 * semantics as ext3_find_entry(). It returns 0 on success or a negative error code on failure.
1430 *
1431 * NOTE!! The inode part of 'de' is left at 0 - which means you
1432 * may not sleep between calling this and putting something into
1433 * the entry, as someone else might have used it while you slept.
1434 */
1435static int ext3_add_entry (handle_t *handle, struct dentry *dentry,
1436 struct inode *inode)
1437{
1438 struct inode *dir = d_inode(dentry->d_parent);
1439 struct buffer_head * bh;
1440 struct ext3_dir_entry_2 *de;
1441 struct super_block * sb;
1442 int retval;
1443 int dx_fallback=0;
1444 unsigned blocksize;
1445 u32 block, blocks;
1446
1447 sb = dir->i_sb;
1448 blocksize = sb->s_blocksize;
1449 if (!dentry->d_name.len)
1450 return -EINVAL;
1451 if (is_dx(dir)) {
1452 retval = ext3_dx_add_entry(handle, dentry, inode);
1453 if (!retval || (retval != ERR_BAD_DX_DIR))
1454 return retval;
1455 EXT3_I(dir)->i_flags &= ~EXT3_INDEX_FL;
1456 dx_fallback++;
1457 ext3_mark_inode_dirty(handle, dir);
1458 }
1459 blocks = dir->i_size >> sb->s_blocksize_bits;
1460 for (block = 0; block < blocks; block++) {
1461 if (!(bh = ext3_dir_bread(handle, dir, block, 0, &retval)))
1462 return retval;
1463
1464 retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
1465 if (retval != -ENOSPC)
1466 return retval;
1467
1468 if (blocks == 1 && !dx_fallback &&
1469 EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX))
1470 return make_indexed_dir(handle, dentry, inode, bh);
1471 brelse(bh);
1472 }
1473 bh = ext3_append(handle, dir, &block, &retval);
1474 if (!bh)
1475 return retval;
1476 de = (struct ext3_dir_entry_2 *) bh->b_data;
1477 de->inode = 0;
1478 de->rec_len = ext3_rec_len_to_disk(blocksize);
1479 return add_dirent_to_buf(handle, dentry, inode, de, bh);
1480}
1481
1482/*
1483 * Returns 0 for success, or a negative error value
1484 */
1485static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
1486 struct inode *inode)
1487{
1488 struct dx_frame frames[2], *frame;
1489 struct dx_entry *entries, *at;
1490 struct dx_hash_info hinfo;
1491 struct buffer_head * bh;
1492 struct inode *dir = d_inode(dentry->d_parent);
1493 struct super_block * sb = dir->i_sb;
1494 struct ext3_dir_entry_2 *de;
1495 int err;
1496
1497 frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err);
1498 if (!frame)
1499 return err;
1500 entries = frame->entries;
1501 at = frame->at;
1502
1503 if (!(bh = ext3_dir_bread(handle, dir, dx_get_block(frame->at), 0, &err)))
1504 goto cleanup;
1505
1506 BUFFER_TRACE(bh, "get_write_access");
1507 err = ext3_journal_get_write_access(handle, bh);
1508 if (err)
1509 goto journal_error;
1510
1511 err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
1512 if (err != -ENOSPC) {
1513 bh = NULL;
1514 goto cleanup;
1515 }
1516
1517 /* Block full, should compress but for now just split */
1518 dxtrace(printk("using %u of %u node entries\n",
1519 dx_get_count(entries), dx_get_limit(entries)));
1520 /* Need to split index? */
1521 if (dx_get_count(entries) == dx_get_limit(entries)) {
1522 u32 newblock;
1523 unsigned icount = dx_get_count(entries);
1524 int levels = frame - frames;
1525 struct dx_entry *entries2;
1526 struct dx_node *node2;
1527 struct buffer_head *bh2;
1528
1529 if (levels && (dx_get_count(frames->entries) ==
1530 dx_get_limit(frames->entries))) {
1531 ext3_warning(sb, __func__,
1532 "Directory index full!");
1533 err = -ENOSPC;
1534 goto cleanup;
1535 }
1536 bh2 = ext3_append (handle, dir, &newblock, &err);
1537 if (!(bh2))
1538 goto cleanup;
1539 node2 = (struct dx_node *)(bh2->b_data);
1540 entries2 = node2->entries;
1541 memset(&node2->fake, 0, sizeof(struct fake_dirent));
1542 node2->fake.rec_len = ext3_rec_len_to_disk(sb->s_blocksize);
1543 BUFFER_TRACE(frame->bh, "get_write_access");
1544 err = ext3_journal_get_write_access(handle, frame->bh);
1545 if (err)
1546 goto journal_error;
1547 if (levels) {
1548 unsigned icount1 = icount/2, icount2 = icount - icount1;
1549 unsigned hash2 = dx_get_hash(entries + icount1);
1550 dxtrace(printk("Split index %i/%i\n", icount1, icount2));
1551
1552 BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
1553 err = ext3_journal_get_write_access(handle,
1554 frames[0].bh);
1555 if (err)
1556 goto journal_error;
1557
1558 memcpy ((char *) entries2, (char *) (entries + icount1),
1559 icount2 * sizeof(struct dx_entry));
1560 dx_set_count (entries, icount1);
1561 dx_set_count (entries2, icount2);
1562 dx_set_limit (entries2, dx_node_limit(dir));
1563
1564 /* Which index block gets the new entry? */
1565 if (at - entries >= icount1) {
1566 frame->at = at = at - entries - icount1 + entries2;
1567 frame->entries = entries = entries2;
1568 swap(frame->bh, bh2);
1569 }
1570 dx_insert_block (frames + 0, hash2, newblock);
1571 dxtrace(dx_show_index ("node", frames[1].entries));
1572 dxtrace(dx_show_index ("node",
1573 ((struct dx_node *) bh2->b_data)->entries));
1574 err = ext3_journal_dirty_metadata(handle, bh2);
1575 if (err)
1576 goto journal_error;
1577 brelse (bh2);
1578 } else {
1579 dxtrace(printk("Creating second level index...\n"));
1580 memcpy((char *) entries2, (char *) entries,
1581 icount * sizeof(struct dx_entry));
1582 dx_set_limit(entries2, dx_node_limit(dir));
1583
1584 /* Set up root */
1585 dx_set_count(entries, 1);
1586 dx_set_block(entries + 0, newblock);
1587 ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
1588
1589 /* Add new access path frame */
1590 frame = frames + 1;
1591 frame->at = at = at - entries + entries2;
1592 frame->entries = entries = entries2;
1593 frame->bh = bh2;
1594 err = ext3_journal_get_write_access(handle,
1595 frame->bh);
1596 if (err)
1597 goto journal_error;
1598 }
1599 err = ext3_journal_dirty_metadata(handle, frames[0].bh);
1600 if (err)
1601 goto journal_error;
1602 }
1603 de = do_split(handle, dir, &bh, frame, &hinfo, &err);
1604 if (!de)
1605 goto cleanup;
1606 err = add_dirent_to_buf(handle, dentry, inode, de, bh);
1607 bh = NULL;
1608 goto cleanup;
1609
1610journal_error:
1611 ext3_std_error(dir->i_sb, err);
1612cleanup:
1613 if (bh)
1614 brelse(bh);
1615 dx_release(frames);
1616 return err;
1617}
1618
1619/*
1620 * ext3_delete_entry deletes a directory entry by merging it with the
1621 * previous entry
1622 */
1623static int ext3_delete_entry (handle_t *handle,
1624 struct inode * dir,
1625 struct ext3_dir_entry_2 * de_del,
1626 struct buffer_head * bh)
1627{
1628 struct ext3_dir_entry_2 * de, * pde;
1629 int i;
1630
1631 i = 0;
1632 pde = NULL;
1633 de = (struct ext3_dir_entry_2 *) bh->b_data;
1634 while (i < bh->b_size) {
1635 if (!ext3_check_dir_entry("ext3_delete_entry", dir, de, bh, i))
1636 return -EIO;
1637 if (de == de_del) {
1638 int err;
1639
1640 BUFFER_TRACE(bh, "get_write_access");
1641 err = ext3_journal_get_write_access(handle, bh);
1642 if (err)
1643 goto journal_error;
1644
1645 if (pde)
1646 pde->rec_len = ext3_rec_len_to_disk(
1647 ext3_rec_len_from_disk(pde->rec_len) +
1648 ext3_rec_len_from_disk(de->rec_len));
1649 else
1650 de->inode = 0;
1651 dir->i_version++;
1652 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
1653 err = ext3_journal_dirty_metadata(handle, bh);
1654 if (err) {
1655journal_error:
1656 ext3_std_error(dir->i_sb, err);
1657 return err;
1658 }
1659 return 0;
1660 }
1661 i += ext3_rec_len_from_disk(de->rec_len);
1662 pde = de;
1663 de = ext3_next_entry(de);
1664 }
1665 return -ENOENT;
1666}
1667
1668static int ext3_add_nondir(handle_t *handle,
1669 struct dentry *dentry, struct inode *inode)
1670{
1671 int err = ext3_add_entry(handle, dentry, inode);
1672 if (!err) {
1673 ext3_mark_inode_dirty(handle, inode);
1674 unlock_new_inode(inode);
1675 d_instantiate(dentry, inode);
1676 return 0;
1677 }
1678 drop_nlink(inode);
1679 unlock_new_inode(inode);
1680 iput(inode);
1681 return err;
1682}
1683
1684/*
1685 * By the time this is called, we already have created
1686 * the directory cache entry for the new file, but it
1687 * is so far negative - it has no inode.
1688 *
1689 * If the create succeeds, we fill in the inode information
1690 * with d_instantiate().
1691 */
1692static int ext3_create (struct inode * dir, struct dentry * dentry, umode_t mode,
1693 bool excl)
1694{
1695 handle_t *handle;
1696 struct inode * inode;
1697 int err, retries = 0;
1698
1699 dquot_initialize(dir);
1700
1701retry:
1702 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
1703 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
1704 EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
1705 if (IS_ERR(handle))
1706 return PTR_ERR(handle);
1707
1708 if (IS_DIRSYNC(dir))
1709 handle->h_sync = 1;
1710
1711 inode = ext3_new_inode (handle, dir, &dentry->d_name, mode);
1712 err = PTR_ERR(inode);
1713 if (!IS_ERR(inode)) {
1714 inode->i_op = &ext3_file_inode_operations;
1715 inode->i_fop = &ext3_file_operations;
1716 ext3_set_aops(inode);
1717 err = ext3_add_nondir(handle, dentry, inode);
1718 }
1719 ext3_journal_stop(handle);
1720 if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
1721 goto retry;
1722 return err;
1723}
1724
1725static int ext3_mknod (struct inode * dir, struct dentry *dentry,
1726 umode_t mode, dev_t rdev)
1727{
1728 handle_t *handle;
1729 struct inode *inode;
1730 int err, retries = 0;
1731
1732 if (!new_valid_dev(rdev))
1733 return -EINVAL;
1734
1735 dquot_initialize(dir);
1736
1737retry:
1738 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
1739 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
1740 EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
1741 if (IS_ERR(handle))
1742 return PTR_ERR(handle);
1743
1744 if (IS_DIRSYNC(dir))
1745 handle->h_sync = 1;
1746
1747 inode = ext3_new_inode (handle, dir, &dentry->d_name, mode);
1748 err = PTR_ERR(inode);
1749 if (!IS_ERR(inode)) {
1750 init_special_inode(inode, inode->i_mode, rdev);
1751#ifdef CONFIG_EXT3_FS_XATTR
1752 inode->i_op = &ext3_special_inode_operations;
1753#endif
1754 err = ext3_add_nondir(handle, dentry, inode);
1755 }
1756 ext3_journal_stop(handle);
1757 if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
1758 goto retry;
1759 return err;
1760}
1761
1762static int ext3_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
1763{
1764 handle_t *handle;
1765 struct inode *inode;
1766 int err, retries = 0;
1767
1768 dquot_initialize(dir);
1769
1770retry:
1771 handle = ext3_journal_start(dir, EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) +
1772 4 + EXT3_XATTR_TRANS_BLOCKS);
1773
1774 if (IS_ERR(handle))
1775 return PTR_ERR(handle);
1776
1777 inode = ext3_new_inode (handle, dir, NULL, mode);
1778 err = PTR_ERR(inode);
1779 if (!IS_ERR(inode)) {
1780 inode->i_op = &ext3_file_inode_operations;
1781 inode->i_fop = &ext3_file_operations;
1782 ext3_set_aops(inode);
1783 d_tmpfile(dentry, inode);
1784 err = ext3_orphan_add(handle, inode);
1785 if (err)
1786 goto err_unlock_inode;
1787 mark_inode_dirty(inode);
1788 unlock_new_inode(inode);
1789 }
1790 ext3_journal_stop(handle);
1791 if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
1792 goto retry;
1793 return err;
1794err_unlock_inode:
1795 ext3_journal_stop(handle);
1796 unlock_new_inode(inode);
1797 return err;
1798}
1799
1800static int ext3_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
1801{
1802 handle_t *handle;
1803 struct inode * inode;
1804 struct buffer_head * dir_block = NULL;
1805 struct ext3_dir_entry_2 * de;
1806 int err, retries = 0;
1807
1808 if (dir->i_nlink >= EXT3_LINK_MAX)
1809 return -EMLINK;
1810
1811 dquot_initialize(dir);
1812
1813retry:
1814 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
1815 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
1816 EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
1817 if (IS_ERR(handle))
1818 return PTR_ERR(handle);
1819
1820 if (IS_DIRSYNC(dir))
1821 handle->h_sync = 1;
1822
1823 inode = ext3_new_inode (handle, dir, &dentry->d_name, S_IFDIR | mode);
1824 err = PTR_ERR(inode);
1825 if (IS_ERR(inode))
1826 goto out_stop;
1827
1828 inode->i_op = &ext3_dir_inode_operations;
1829 inode->i_fop = &ext3_dir_operations;
1830 inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize;
1831 if (!(dir_block = ext3_dir_bread(handle, inode, 0, 1, &err)))
1832 goto out_clear_inode;
1833
1834 BUFFER_TRACE(dir_block, "get_write_access");
1835 err = ext3_journal_get_write_access(handle, dir_block);
1836 if (err)
1837 goto out_clear_inode;
1838
1839 de = (struct ext3_dir_entry_2 *) dir_block->b_data;
1840 de->inode = cpu_to_le32(inode->i_ino);
1841 de->name_len = 1;
1842 de->rec_len = ext3_rec_len_to_disk(EXT3_DIR_REC_LEN(de->name_len));
1843 strcpy (de->name, ".");
1844 ext3_set_de_type(dir->i_sb, de, S_IFDIR);
1845 de = ext3_next_entry(de);
1846 de->inode = cpu_to_le32(dir->i_ino);
1847 de->rec_len = ext3_rec_len_to_disk(inode->i_sb->s_blocksize -
1848 EXT3_DIR_REC_LEN(1));
1849 de->name_len = 2;
1850 strcpy (de->name, "..");
1851 ext3_set_de_type(dir->i_sb, de, S_IFDIR);
1852 set_nlink(inode, 2);
1853 BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata");
1854 err = ext3_journal_dirty_metadata(handle, dir_block);
1855 if (err)
1856 goto out_clear_inode;
1857
1858 err = ext3_mark_inode_dirty(handle, inode);
1859 if (!err)
1860 err = ext3_add_entry (handle, dentry, inode);
1861
1862 if (err) {
1863out_clear_inode:
1864 clear_nlink(inode);
1865 unlock_new_inode(inode);
1866 ext3_mark_inode_dirty(handle, inode);
1867 iput (inode);
1868 goto out_stop;
1869 }
1870 inc_nlink(dir);
1871 ext3_update_dx_flag(dir);
1872 err = ext3_mark_inode_dirty(handle, dir);
1873 if (err)
1874 goto out_clear_inode;
1875
1876 unlock_new_inode(inode);
1877 d_instantiate(dentry, inode);
1878out_stop:
1879 brelse(dir_block);
1880 ext3_journal_stop(handle);
1881 if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
1882 goto retry;
1883 return err;
1884}
1885
1886/*
1887 * routine to check that the specified directory is empty (for rmdir)
1888 */
1889static int empty_dir (struct inode * inode)
1890{
1891 unsigned long offset;
1892 struct buffer_head * bh;
1893 struct ext3_dir_entry_2 * de, * de1;
1894 struct super_block * sb;
1895 int err = 0;
1896
1897 sb = inode->i_sb;
1898 if (inode->i_size < EXT3_DIR_REC_LEN(1) + EXT3_DIR_REC_LEN(2) ||
1899 !(bh = ext3_dir_bread(NULL, inode, 0, 0, &err))) {
1900 if (err)
1901 ext3_error(inode->i_sb, __func__,
1902 "error %d reading directory #%lu offset 0",
1903 err, inode->i_ino);
1904 else
1905 ext3_warning(inode->i_sb, __func__,
1906 "bad directory (dir #%lu) - no data block",
1907 inode->i_ino);
1908 return 1;
1909 }
1910 de = (struct ext3_dir_entry_2 *) bh->b_data;
1911 de1 = ext3_next_entry(de);
1912 if (le32_to_cpu(de->inode) != inode->i_ino ||
1913 !le32_to_cpu(de1->inode) ||
1914 strcmp (".", de->name) ||
1915 strcmp ("..", de1->name)) {
1916 ext3_warning (inode->i_sb, "empty_dir",
1917 "bad directory (dir #%lu) - no `.' or `..'",
1918 inode->i_ino);
1919 brelse (bh);
1920 return 1;
1921 }
1922 offset = ext3_rec_len_from_disk(de->rec_len) +
1923 ext3_rec_len_from_disk(de1->rec_len);
1924 de = ext3_next_entry(de1);
1925 while (offset < inode->i_size ) {
1926 if (!bh ||
1927 (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
1928 err = 0;
1929 brelse (bh);
1930 if (!(bh = ext3_dir_bread (NULL, inode,
1931 offset >> EXT3_BLOCK_SIZE_BITS(sb), 0, &err))) {
1932 if (err)
1933 ext3_error(sb, __func__,
1934 "error %d reading directory"
1935 " #%lu offset %lu",
1936 err, inode->i_ino, offset);
1937 offset += sb->s_blocksize;
1938 continue;
1939 }
1940 de = (struct ext3_dir_entry_2 *) bh->b_data;
1941 }
1942 if (!ext3_check_dir_entry("empty_dir", inode, de, bh, offset)) {
1943 de = (struct ext3_dir_entry_2 *)(bh->b_data +
1944 sb->s_blocksize);
1945 offset = (offset | (sb->s_blocksize - 1)) + 1;
1946 continue;
1947 }
1948 if (le32_to_cpu(de->inode)) {
1949 brelse (bh);
1950 return 0;
1951 }
1952 offset += ext3_rec_len_from_disk(de->rec_len);
1953 de = ext3_next_entry(de);
1954 }
1955 brelse (bh);
1956 return 1;
1957}
1958
1959/* ext3_orphan_add() links an unlinked or truncated inode into a list of
1960 * such inodes, starting at the superblock, in case we crash before the
1961 * file is closed/deleted, or in case the inode truncate spans multiple
1962 * transactions and the last transaction is not recovered after a crash.
1963 *
1964 * At filesystem recovery time, we walk this list deleting unlinked
1965 * inodes and truncating linked inodes in ext3_orphan_cleanup().
1966 */
1967int ext3_orphan_add(handle_t *handle, struct inode *inode)
1968{
1969 struct super_block *sb = inode->i_sb;
1970 struct ext3_iloc iloc;
1971 int err = 0, rc;
1972
1973 mutex_lock(&EXT3_SB(sb)->s_orphan_lock);
1974 if (!list_empty(&EXT3_I(inode)->i_orphan))
1975 goto out_unlock;
1976
1977 /* Orphan handling is only valid for files with data blocks
1978 * being truncated, or files being unlinked. */
1979
1980 /* @@@ FIXME: Observation from aviro:
1981 * I think I can trigger J_ASSERT in ext3_orphan_add(). We block
1982 * here (on s_orphan_lock), so race with ext3_link() which might bump
1983 * ->i_nlink. For, say it, character device. Not a regular file,
1984 * not a directory, not a symlink and ->i_nlink > 0.
1985 *
1986 * tytso, 4/25/2009: I'm not sure how that could happen;
1987 * shouldn't the fs core protect us from these sort of
1988 * unlink()/link() races?
1989 */
1990 J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
1991 S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
1992
1993 BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "get_write_access");
1994 err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
1995 if (err)
1996 goto out_unlock;
1997
1998 err = ext3_reserve_inode_write(handle, inode, &iloc);
1999 if (err)
2000 goto out_unlock;
2001
2002 /* Insert this inode at the head of the on-disk orphan list... */
2003 NEXT_ORPHAN(inode) = le32_to_cpu(EXT3_SB(sb)->s_es->s_last_orphan);
2004 EXT3_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
2005 err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
2006 rc = ext3_mark_iloc_dirty(handle, inode, &iloc);
2007 if (!err)
2008 err = rc;
2009
2010 /* Only add to the head of the in-memory list if all the
2011 * previous operations succeeded. If the orphan_add is going to
2012 * fail (possibly taking the journal offline), we can't risk
2013 * leaving the inode on the orphan list: stray orphan-list
2014 * entries can cause panics at unmount time.
2015 *
2016 * This is safe: on error we're going to ignore the orphan list
2017 * anyway on the next recovery. */
2018 if (!err)
2019 list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan);
2020
2021 jbd_debug(4, "superblock will point to %lu\n", inode->i_ino);
2022 jbd_debug(4, "orphan inode %lu will point to %d\n",
2023 inode->i_ino, NEXT_ORPHAN(inode));
2024out_unlock:
2025 mutex_unlock(&EXT3_SB(sb)->s_orphan_lock);
2026 ext3_std_error(inode->i_sb, err);
2027 return err;
2028}
2029
2030/*
2031 * ext3_orphan_del() removes an unlinked or truncated inode from the list
2032 * of such inodes stored on disk, because it is finally being cleaned up.
2033 */
2034int ext3_orphan_del(handle_t *handle, struct inode *inode)
2035{
2036 struct list_head *prev;
2037 struct ext3_inode_info *ei = EXT3_I(inode);
2038 struct ext3_sb_info *sbi;
2039 unsigned long ino_next;
2040 struct ext3_iloc iloc;
2041 int err = 0;
2042
2043 mutex_lock(&EXT3_SB(inode->i_sb)->s_orphan_lock);
2044 if (list_empty(&ei->i_orphan))
2045 goto out;
2046
2047 ino_next = NEXT_ORPHAN(inode);
2048 prev = ei->i_orphan.prev;
2049 sbi = EXT3_SB(inode->i_sb);
2050
2051 jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino);
2052
2053 list_del_init(&ei->i_orphan);
2054
2055 /* If we're on an error path, we may not have a valid
2056 * transaction handle with which to update the orphan list on
2057 * disk, but we still need to remove the inode from the linked
2058 * list in memory. */
2059 if (!handle)
2060 goto out;
2061
2062 err = ext3_reserve_inode_write(handle, inode, &iloc);
2063 if (err)
2064 goto out_err;
2065
2066 if (prev == &sbi->s_orphan) {
2067 jbd_debug(4, "superblock will point to %lu\n", ino_next);
2068 BUFFER_TRACE(sbi->s_sbh, "get_write_access");
2069 err = ext3_journal_get_write_access(handle, sbi->s_sbh);
2070 if (err)
2071 goto out_brelse;
2072 sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
2073 err = ext3_journal_dirty_metadata(handle, sbi->s_sbh);
2074 } else {
2075 struct ext3_iloc iloc2;
2076 struct inode *i_prev =
2077 &list_entry(prev, struct ext3_inode_info, i_orphan)->vfs_inode;
2078
2079 jbd_debug(4, "orphan inode %lu will point to %lu\n",
2080 i_prev->i_ino, ino_next);
2081 err = ext3_reserve_inode_write(handle, i_prev, &iloc2);
2082 if (err)
2083 goto out_brelse;
2084 NEXT_ORPHAN(i_prev) = ino_next;
2085 err = ext3_mark_iloc_dirty(handle, i_prev, &iloc2);
2086 }
2087 if (err)
2088 goto out_brelse;
2089 NEXT_ORPHAN(inode) = 0;
2090 err = ext3_mark_iloc_dirty(handle, inode, &iloc);
2091
2092out_err:
2093 ext3_std_error(inode->i_sb, err);
2094out:
2095 mutex_unlock(&EXT3_SB(inode->i_sb)->s_orphan_lock);
2096 return err;
2097
2098out_brelse:
2099 brelse(iloc.bh);
2100 goto out_err;
2101}
2102
2103static int ext3_rmdir (struct inode * dir, struct dentry *dentry)
2104{
2105 int retval;
2106 struct inode * inode;
2107 struct buffer_head * bh;
2108 struct ext3_dir_entry_2 * de;
2109 handle_t *handle;
2110
2111 /* Initialize quotas before so that eventual writes go in
2112 * separate transaction */
2113 dquot_initialize(dir);
2114 dquot_initialize(d_inode(dentry));
2115
2116 handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb));
2117 if (IS_ERR(handle))
2118 return PTR_ERR(handle);
2119
2120 retval = -ENOENT;
2121 bh = ext3_find_entry(dir, &dentry->d_name, &de);
2122 if (!bh)
2123 goto end_rmdir;
2124
2125 if (IS_DIRSYNC(dir))
2126 handle->h_sync = 1;
2127
2128 inode = d_inode(dentry);
2129
2130 retval = -EIO;
2131 if (le32_to_cpu(de->inode) != inode->i_ino)
2132 goto end_rmdir;
2133
2134 retval = -ENOTEMPTY;
2135 if (!empty_dir (inode))
2136 goto end_rmdir;
2137
2138 retval = ext3_delete_entry(handle, dir, de, bh);
2139 if (retval)
2140 goto end_rmdir;
2141 if (inode->i_nlink != 2)
2142 ext3_warning (inode->i_sb, "ext3_rmdir",
2143 "empty directory has nlink!=2 (%d)",
2144 inode->i_nlink);
2145 inode->i_version++;
2146 clear_nlink(inode);
2147 /* There's no need to set i_disksize: the fact that i_nlink is
2148 * zero will ensure that the right thing happens during any
2149 * recovery. */
2150 inode->i_size = 0;
2151 ext3_orphan_add(handle, inode);
2152 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
2153 ext3_mark_inode_dirty(handle, inode);
2154 drop_nlink(dir);
2155 ext3_update_dx_flag(dir);
2156 ext3_mark_inode_dirty(handle, dir);
2157
2158end_rmdir:
2159 ext3_journal_stop(handle);
2160 brelse (bh);
2161 return retval;
2162}
2163
2164static int ext3_unlink(struct inode * dir, struct dentry *dentry)
2165{
2166 int retval;
2167 struct inode * inode;
2168 struct buffer_head * bh;
2169 struct ext3_dir_entry_2 * de;
2170 handle_t *handle;
2171
2172 trace_ext3_unlink_enter(dir, dentry);
2173 /* Initialize quotas before so that eventual writes go
2174 * in separate transaction */
2175 dquot_initialize(dir);
2176 dquot_initialize(d_inode(dentry));
2177
2178 handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb));
2179 if (IS_ERR(handle))
2180 return PTR_ERR(handle);
2181
2182 if (IS_DIRSYNC(dir))
2183 handle->h_sync = 1;
2184
2185 retval = -ENOENT;
2186 bh = ext3_find_entry(dir, &dentry->d_name, &de);
2187 if (!bh)
2188 goto end_unlink;
2189
2190 inode = d_inode(dentry);
2191
2192 retval = -EIO;
2193 if (le32_to_cpu(de->inode) != inode->i_ino)
2194 goto end_unlink;
2195
2196 if (!inode->i_nlink) {
2197 ext3_warning (inode->i_sb, "ext3_unlink",
2198 "Deleting nonexistent file (%lu), %d",
2199 inode->i_ino, inode->i_nlink);
2200 set_nlink(inode, 1);
2201 }
2202 retval = ext3_delete_entry(handle, dir, de, bh);
2203 if (retval)
2204 goto end_unlink;
2205 dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
2206 ext3_update_dx_flag(dir);
2207 ext3_mark_inode_dirty(handle, dir);
2208 drop_nlink(inode);
2209 if (!inode->i_nlink)
2210 ext3_orphan_add(handle, inode);
2211 inode->i_ctime = dir->i_ctime;
2212 ext3_mark_inode_dirty(handle, inode);
2213 retval = 0;
2214
2215end_unlink:
2216 ext3_journal_stop(handle);
2217 brelse (bh);
2218 trace_ext3_unlink_exit(dentry, retval);
2219 return retval;
2220}
2221
2222static int ext3_symlink (struct inode * dir,
2223 struct dentry *dentry, const char * symname)
2224{
2225 handle_t *handle;
2226 struct inode * inode;
2227 int l, err, retries = 0;
2228 int credits;
2229
2230 l = strlen(symname)+1;
2231 if (l > dir->i_sb->s_blocksize)
2232 return -ENAMETOOLONG;
2233
2234 dquot_initialize(dir);
2235
2236 if (l > EXT3_N_BLOCKS * 4) {
2237 /*
2238 * For non-fast symlinks, we just allocate inode and put it on
2239 * orphan list in the first transaction => we need bitmap,
2240 * group descriptor, sb, inode block, quota blocks, and
2241 * possibly selinux xattr blocks.
2242 */
2243 credits = 4 + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) +
2244 EXT3_XATTR_TRANS_BLOCKS;
2245 } else {
2246 /*
2247 * Fast symlink. We have to add entry to directory
2248 * (EXT3_DATA_TRANS_BLOCKS + EXT3_INDEX_EXTRA_TRANS_BLOCKS),
2249 * allocate new inode (bitmap, group descriptor, inode block,
2250 * quota blocks, sb is already counted in previous macros).
2251 */
2252 credits = EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
2253 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
2254 EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
2255 }
2256retry:
2257 handle = ext3_journal_start(dir, credits);
2258 if (IS_ERR(handle))
2259 return PTR_ERR(handle);
2260
2261 if (IS_DIRSYNC(dir))
2262 handle->h_sync = 1;
2263
2264 inode = ext3_new_inode (handle, dir, &dentry->d_name, S_IFLNK|S_IRWXUGO);
2265 err = PTR_ERR(inode);
2266 if (IS_ERR(inode))
2267 goto out_stop;
2268
2269 if (l > EXT3_N_BLOCKS * 4) {
2270 inode->i_op = &ext3_symlink_inode_operations;
2271 ext3_set_aops(inode);
2272 /*
2273 * We cannot call page_symlink() with transaction started
2274 * because it calls into ext3_write_begin() which acquires page
2275 * lock which ranks below transaction start (and it can also
2276 * wait for journal commit if we are running out of space). So
2277 * we have to stop transaction now and restart it when symlink
2278	 * contents are written.
2279 *
2280	 * To keep the fs consistent in case of a crash, we have to put the
2281	 * inode on the orphan list in the meantime.
2282 */
2283 drop_nlink(inode);
2284 err = ext3_orphan_add(handle, inode);
2285 ext3_journal_stop(handle);
2286 if (err)
2287 goto err_drop_inode;
2288 err = __page_symlink(inode, symname, l, 1);
2289 if (err)
2290 goto err_drop_inode;
2291 /*
2292 * Now inode is being linked into dir (EXT3_DATA_TRANS_BLOCKS
2293 * + EXT3_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified
2294 */
2295 handle = ext3_journal_start(dir,
2296 EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
2297 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 1);
2298 if (IS_ERR(handle)) {
2299 err = PTR_ERR(handle);
2300 goto err_drop_inode;
2301 }
2302 set_nlink(inode, 1);
2303 err = ext3_orphan_del(handle, inode);
2304 if (err) {
2305 ext3_journal_stop(handle);
2306 drop_nlink(inode);
2307 goto err_drop_inode;
2308 }
2309 } else {
2310 inode->i_op = &ext3_fast_symlink_inode_operations;
2311 inode->i_link = (char*)&EXT3_I(inode)->i_data;
2312 memcpy(inode->i_link, symname, l);
2313 inode->i_size = l-1;
2314 }
2315 EXT3_I(inode)->i_disksize = inode->i_size;
2316 err = ext3_add_nondir(handle, dentry, inode);
2317out_stop:
2318 ext3_journal_stop(handle);
2319 if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
2320 goto retry;
2321 return err;
2322err_drop_inode:
2323 unlock_new_inode(inode);
2324 iput(inode);
2325 return err;
2326}
2327
2328static int ext3_link (struct dentry * old_dentry,
2329 struct inode * dir, struct dentry *dentry)
2330{
2331 handle_t *handle;
2332 struct inode *inode = d_inode(old_dentry);
2333 int err, retries = 0;
2334
2335 if (inode->i_nlink >= EXT3_LINK_MAX)
2336 return -EMLINK;
2337
2338 dquot_initialize(dir);
2339
2340retry:
2341 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
2342 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 1);
2343 if (IS_ERR(handle))
2344 return PTR_ERR(handle);
2345
2346 if (IS_DIRSYNC(dir))
2347 handle->h_sync = 1;
2348
2349 inode->i_ctime = CURRENT_TIME_SEC;
2350 inc_nlink(inode);
2351 ihold(inode);
2352
2353 err = ext3_add_entry(handle, dentry, inode);
2354 if (!err) {
2355 ext3_mark_inode_dirty(handle, inode);
2356 /* this can happen only for tmpfile being
2357 * linked the first time
2358 */
2359 if (inode->i_nlink == 1)
2360 ext3_orphan_del(handle, inode);
2361 d_instantiate(dentry, inode);
2362 } else {
2363 drop_nlink(inode);
2364 iput(inode);
2365 }
2366 ext3_journal_stop(handle);
2367 if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
2368 goto retry;
2369 return err;
2370}
2371
2372#define PARENT_INO(buffer) \
2373 (ext3_next_entry((struct ext3_dir_entry_2 *)(buffer))->inode)
2374
2375/*
2376 * Anybody can rename anything with this: the permission checks are left to the
2377 * higher-level routines.
2378 */
2379static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
2380 struct inode * new_dir,struct dentry *new_dentry)
2381{
2382 handle_t *handle;
2383 struct inode * old_inode, * new_inode;
2384 struct buffer_head * old_bh, * new_bh, * dir_bh;
2385 struct ext3_dir_entry_2 * old_de, * new_de;
2386 int retval, flush_file = 0;
2387
2388 dquot_initialize(old_dir);
2389 dquot_initialize(new_dir);
2390
2391 old_bh = new_bh = dir_bh = NULL;
2392
2393 /* Initialize quotas before so that eventual writes go
2394 * in separate transaction */
2395 if (d_really_is_positive(new_dentry))
2396 dquot_initialize(d_inode(new_dentry));
2397 handle = ext3_journal_start(old_dir, 2 *
2398 EXT3_DATA_TRANS_BLOCKS(old_dir->i_sb) +
2399 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2);
2400 if (IS_ERR(handle))
2401 return PTR_ERR(handle);
2402
2403 if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
2404 handle->h_sync = 1;
2405
2406 old_bh = ext3_find_entry(old_dir, &old_dentry->d_name, &old_de);
2407 /*
2408	 * The check of the inode number is _not_ due to possible IO errors.
2409 * We might rmdir the source, keep it as pwd of some process
2410 * and merrily kill the link to whatever was created under the
2411 * same name. Goodbye sticky bit ;-<
2412 */
2413 old_inode = d_inode(old_dentry);
2414 retval = -ENOENT;
2415 if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino)
2416 goto end_rename;
2417
2418 new_inode = d_inode(new_dentry);
2419 new_bh = ext3_find_entry(new_dir, &new_dentry->d_name, &new_de);
2420 if (new_bh) {
2421 if (!new_inode) {
2422 brelse (new_bh);
2423 new_bh = NULL;
2424 }
2425 }
2426 if (S_ISDIR(old_inode->i_mode)) {
2427 if (new_inode) {
2428 retval = -ENOTEMPTY;
2429 if (!empty_dir (new_inode))
2430 goto end_rename;
2431 }
2432 retval = -EIO;
2433 dir_bh = ext3_dir_bread(handle, old_inode, 0, 0, &retval);
2434 if (!dir_bh)
2435 goto end_rename;
2436 if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino)
2437 goto end_rename;
2438 retval = -EMLINK;
2439 if (!new_inode && new_dir!=old_dir &&
2440 new_dir->i_nlink >= EXT3_LINK_MAX)
2441 goto end_rename;
2442 }
2443 if (!new_bh) {
2444 retval = ext3_add_entry (handle, new_dentry, old_inode);
2445 if (retval)
2446 goto end_rename;
2447 } else {
2448 BUFFER_TRACE(new_bh, "get write access");
2449 retval = ext3_journal_get_write_access(handle, new_bh);
2450 if (retval)
2451 goto journal_error;
2452 new_de->inode = cpu_to_le32(old_inode->i_ino);
2453 if (EXT3_HAS_INCOMPAT_FEATURE(new_dir->i_sb,
2454 EXT3_FEATURE_INCOMPAT_FILETYPE))
2455 new_de->file_type = old_de->file_type;
2456 new_dir->i_version++;
2457 new_dir->i_ctime = new_dir->i_mtime = CURRENT_TIME_SEC;
2458 ext3_mark_inode_dirty(handle, new_dir);
2459 BUFFER_TRACE(new_bh, "call ext3_journal_dirty_metadata");
2460 retval = ext3_journal_dirty_metadata(handle, new_bh);
2461 if (retval)
2462 goto journal_error;
2463 brelse(new_bh);
2464 new_bh = NULL;
2465 }
2466
2467 /*
2468 * Like most other Unix systems, set the ctime for inodes on a
2469 * rename.
2470 */
2471 old_inode->i_ctime = CURRENT_TIME_SEC;
2472 ext3_mark_inode_dirty(handle, old_inode);
2473
2474 /*
2475 * ok, that's it
2476 */
2477 if (le32_to_cpu(old_de->inode) != old_inode->i_ino ||
2478 old_de->name_len != old_dentry->d_name.len ||
2479 strncmp(old_de->name, old_dentry->d_name.name, old_de->name_len) ||
2480 (retval = ext3_delete_entry(handle, old_dir,
2481 old_de, old_bh)) == -ENOENT) {
2482 /* old_de could have moved from under us during htree split, so
2483 * make sure that we are deleting the right entry. We might
2484 * also be pointing to a stale entry in the unused part of
2485 * old_bh so just checking inum and the name isn't enough. */
2486 struct buffer_head *old_bh2;
2487 struct ext3_dir_entry_2 *old_de2;
2488
2489 old_bh2 = ext3_find_entry(old_dir, &old_dentry->d_name,
2490 &old_de2);
2491 if (old_bh2) {
2492 retval = ext3_delete_entry(handle, old_dir,
2493 old_de2, old_bh2);
2494 brelse(old_bh2);
2495 }
2496 }
2497 if (retval) {
2498 ext3_warning(old_dir->i_sb, "ext3_rename",
2499 "Deleting old file (%lu), %d, error=%d",
2500 old_dir->i_ino, old_dir->i_nlink, retval);
2501 }
2502
2503 if (new_inode) {
2504 drop_nlink(new_inode);
2505 new_inode->i_ctime = CURRENT_TIME_SEC;
2506 }
2507 old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC;
2508 ext3_update_dx_flag(old_dir);
2509 if (dir_bh) {
2510 BUFFER_TRACE(dir_bh, "get_write_access");
2511 retval = ext3_journal_get_write_access(handle, dir_bh);
2512 if (retval)
2513 goto journal_error;
2514 PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
2515 BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata");
2516 retval = ext3_journal_dirty_metadata(handle, dir_bh);
2517 if (retval) {
2518journal_error:
2519 ext3_std_error(new_dir->i_sb, retval);
2520 goto end_rename;
2521 }
2522 drop_nlink(old_dir);
2523 if (new_inode) {
2524 drop_nlink(new_inode);
2525 } else {
2526 inc_nlink(new_dir);
2527 ext3_update_dx_flag(new_dir);
2528 ext3_mark_inode_dirty(handle, new_dir);
2529 }
2530 }
2531 ext3_mark_inode_dirty(handle, old_dir);
2532 if (new_inode) {
2533 ext3_mark_inode_dirty(handle, new_inode);
2534 if (!new_inode->i_nlink)
2535 ext3_orphan_add(handle, new_inode);
2536 if (ext3_should_writeback_data(new_inode))
2537 flush_file = 1;
2538 }
2539 retval = 0;
2540
2541end_rename:
2542 brelse (dir_bh);
2543 brelse (old_bh);
2544 brelse (new_bh);
2545 ext3_journal_stop(handle);
2546 if (retval == 0 && flush_file)
2547 filemap_flush(old_inode->i_mapping);
2548 return retval;
2549}
2550
2551/*
2552 * directories can handle most operations...
2553 */
2554const struct inode_operations ext3_dir_inode_operations = {
2555 .create = ext3_create,
2556 .lookup = ext3_lookup,
2557 .link = ext3_link,
2558 .unlink = ext3_unlink,
2559 .symlink = ext3_symlink,
2560 .mkdir = ext3_mkdir,
2561 .rmdir = ext3_rmdir,
2562 .mknod = ext3_mknod,
2563 .tmpfile = ext3_tmpfile,
2564 .rename = ext3_rename,
2565 .setattr = ext3_setattr,
2566#ifdef CONFIG_EXT3_FS_XATTR
2567 .setxattr = generic_setxattr,
2568 .getxattr = generic_getxattr,
2569 .listxattr = ext3_listxattr,
2570 .removexattr = generic_removexattr,
2571#endif
2572 .get_acl = ext3_get_acl,
2573 .set_acl = ext3_set_acl,
2574};
2575
2576const struct inode_operations ext3_special_inode_operations = {
2577 .setattr = ext3_setattr,
2578#ifdef CONFIG_EXT3_FS_XATTR
2579 .setxattr = generic_setxattr,
2580 .getxattr = generic_getxattr,
2581 .listxattr = ext3_listxattr,
2582 .removexattr = generic_removexattr,
2583#endif
2584 .get_acl = ext3_get_acl,
2585 .set_acl = ext3_set_acl,
2586};
diff --git a/fs/ext3/namei.h b/fs/ext3/namei.h
deleted file mode 100644
index 46304d8c9f0a..000000000000
--- a/fs/ext3/namei.h
+++ /dev/null
@@ -1,27 +0,0 @@
1/* linux/fs/ext3/namei.h
2 *
3 * Copyright (C) 2005 Simtec Electronics
4 * Ben Dooks <ben@simtec.co.uk>
5 *
6*/
7
8extern struct dentry *ext3_get_parent(struct dentry *child);
9
10static inline struct buffer_head *ext3_dir_bread(handle_t *handle,
11 struct inode *inode,
12 int block, int create,
13 int *err)
14{
15 struct buffer_head *bh;
16
17 bh = ext3_bread(handle, inode, block, create, err);
18
19 if (!bh && !(*err)) {
20 *err = -EIO;
21 ext3_error(inode->i_sb, __func__,
22 "Directory hole detected on inode %lu\n",
23 inode->i_ino);
24 return NULL;
25 }
26 return bh;
27}
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
deleted file mode 100644
index 27105655502c..000000000000
--- a/fs/ext3/resize.c
+++ /dev/null
@@ -1,1117 +0,0 @@
1/*
2 * linux/fs/ext3/resize.c
3 *
4 * Support for resizing an ext3 filesystem while it is mounted.
5 *
6 * Copyright (C) 2001, 2002 Andreas Dilger <adilger@clusterfs.com>
7 *
8 * This could probably be made into a module, because it is not often in use.
9 */
10
11
12#define EXT3FS_DEBUG
13
14#include "ext3.h"
15
16
17#define outside(b, first, last) ((b) < (first) || (b) >= (last))
18#define inside(b, first, last) ((b) >= (first) && (b) < (last))
19
20static int verify_group_input(struct super_block *sb,
21 struct ext3_new_group_data *input)
22{
23 struct ext3_sb_info *sbi = EXT3_SB(sb);
24 struct ext3_super_block *es = sbi->s_es;
25 ext3_fsblk_t start = le32_to_cpu(es->s_blocks_count);
26 ext3_fsblk_t end = start + input->blocks_count;
27 unsigned group = input->group;
28 ext3_fsblk_t itend = input->inode_table + sbi->s_itb_per_group;
29 unsigned overhead = ext3_bg_has_super(sb, group) ?
30 (1 + ext3_bg_num_gdb(sb, group) +
31 le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
32 ext3_fsblk_t metaend = start + overhead;
33 struct buffer_head *bh = NULL;
34 ext3_grpblk_t free_blocks_count;
35 int err = -EINVAL;
36
37 input->free_blocks_count = free_blocks_count =
38 input->blocks_count - 2 - overhead - sbi->s_itb_per_group;
39
40 if (test_opt(sb, DEBUG))
41 printk(KERN_DEBUG "EXT3-fs: adding %s group %u: %u blocks "
42 "(%d free, %u reserved)\n",
43 ext3_bg_has_super(sb, input->group) ? "normal" :
44 "no-super", input->group, input->blocks_count,
45 free_blocks_count, input->reserved_blocks);
46
47 if (group != sbi->s_groups_count)
48 ext3_warning(sb, __func__,
49 "Cannot add at group %u (only %lu groups)",
50 input->group, sbi->s_groups_count);
51 else if ((start - le32_to_cpu(es->s_first_data_block)) %
52 EXT3_BLOCKS_PER_GROUP(sb))
53 ext3_warning(sb, __func__, "Last group not full");
54 else if (input->reserved_blocks > input->blocks_count / 5)
55 ext3_warning(sb, __func__, "Reserved blocks too high (%u)",
56 input->reserved_blocks);
57 else if (free_blocks_count < 0)
58 ext3_warning(sb, __func__, "Bad blocks count %u",
59 input->blocks_count);
60 else if (!(bh = sb_bread(sb, end - 1)))
61 ext3_warning(sb, __func__,
62 "Cannot read last block ("E3FSBLK")",
63 end - 1);
64 else if (outside(input->block_bitmap, start, end))
65 ext3_warning(sb, __func__,
66 "Block bitmap not in group (block %u)",
67 input->block_bitmap);
68 else if (outside(input->inode_bitmap, start, end))
69 ext3_warning(sb, __func__,
70 "Inode bitmap not in group (block %u)",
71 input->inode_bitmap);
72 else if (outside(input->inode_table, start, end) ||
73 outside(itend - 1, start, end))
74 ext3_warning(sb, __func__,
75 "Inode table not in group (blocks %u-"E3FSBLK")",
76 input->inode_table, itend - 1);
77 else if (input->inode_bitmap == input->block_bitmap)
78 ext3_warning(sb, __func__,
79 "Block bitmap same as inode bitmap (%u)",
80 input->block_bitmap);
81 else if (inside(input->block_bitmap, input->inode_table, itend))
82 ext3_warning(sb, __func__,
83 "Block bitmap (%u) in inode table (%u-"E3FSBLK")",
84 input->block_bitmap, input->inode_table, itend-1);
85 else if (inside(input->inode_bitmap, input->inode_table, itend))
86 ext3_warning(sb, __func__,
87 "Inode bitmap (%u) in inode table (%u-"E3FSBLK")",
88 input->inode_bitmap, input->inode_table, itend-1);
89 else if (inside(input->block_bitmap, start, metaend))
90 ext3_warning(sb, __func__,
91 "Block bitmap (%u) in GDT table"
92 " ("E3FSBLK"-"E3FSBLK")",
93 input->block_bitmap, start, metaend - 1);
94 else if (inside(input->inode_bitmap, start, metaend))
95 ext3_warning(sb, __func__,
96 "Inode bitmap (%u) in GDT table"
97 " ("E3FSBLK"-"E3FSBLK")",
98 input->inode_bitmap, start, metaend - 1);
99 else if (inside(input->inode_table, start, metaend) ||
100 inside(itend - 1, start, metaend))
101 ext3_warning(sb, __func__,
102			     "Inode table (%u-"E3FSBLK") overlaps "
103 "GDT table ("E3FSBLK"-"E3FSBLK")",
104 input->inode_table, itend - 1, start, metaend - 1);
105 else
106 err = 0;
107 brelse(bh);
108
109 return err;
110}
111
112static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
113 ext3_fsblk_t blk)
114{
115 struct buffer_head *bh;
116 int err;
117
118 bh = sb_getblk(sb, blk);
119 if (unlikely(!bh))
120 return ERR_PTR(-ENOMEM);
121 if ((err = ext3_journal_get_write_access(handle, bh))) {
122 brelse(bh);
123 bh = ERR_PTR(err);
124 } else {
125 lock_buffer(bh);
126 memset(bh->b_data, 0, sb->s_blocksize);
127 set_buffer_uptodate(bh);
128 unlock_buffer(bh);
129 }
130
131 return bh;
132}
133
134/*
135 * To avoid calling the atomic setbit hundreds or thousands of times, we only
136 * need to use it within a single byte (to ensure we get endianness right).
137 * We can use memset for the rest of the bitmap as there are no other users.
138 */
139static void mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
140{
141 int i;
142
143 if (start_bit >= end_bit)
144 return;
145
146 ext3_debug("mark end bits +%d through +%d used\n", start_bit, end_bit);
147 for (i = start_bit; i < ((start_bit + 7) & ~7UL); i++)
148 ext3_set_bit(i, bitmap);
149 if (i < end_bit)
150 memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3);
151}
152
153/*
154 * If we have fewer than thresh credits, extend by EXT3_MAX_TRANS_DATA.
155 * If that fails, restart the transaction & regain write access for the
156 * buffer head which is used for block_bitmap modifications.
157 */
158static int extend_or_restart_transaction(handle_t *handle, int thresh,
159 struct buffer_head *bh)
160{
161 int err;
162
163 if (handle->h_buffer_credits >= thresh)
164 return 0;
165
166 err = ext3_journal_extend(handle, EXT3_MAX_TRANS_DATA);
167 if (err < 0)
168 return err;
169 if (err) {
170 err = ext3_journal_restart(handle, EXT3_MAX_TRANS_DATA);
171 if (err)
172 return err;
173 err = ext3_journal_get_write_access(handle, bh);
174 if (err)
175 return err;
176 }
177
178 return 0;
179}
180
181/*
182 * Set up the block and inode bitmaps, and the inode table for the new group.
183 * This doesn't need to be part of the main transaction, since we are only
184 * changing blocks outside the actual filesystem. We still do journaling to
185 * ensure the recovery is correct in case of a failure just after resize.
186 * If any part of this fails, we simply abort the resize.
187 */
188static int setup_new_group_blocks(struct super_block *sb,
189 struct ext3_new_group_data *input)
190{
191 struct ext3_sb_info *sbi = EXT3_SB(sb);
192 ext3_fsblk_t start = ext3_group_first_block_no(sb, input->group);
193 int reserved_gdb = ext3_bg_has_super(sb, input->group) ?
194 le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0;
195 unsigned long gdblocks = ext3_bg_num_gdb(sb, input->group);
196 struct buffer_head *bh;
197 handle_t *handle;
198 ext3_fsblk_t block;
199 ext3_grpblk_t bit;
200 int i;
201 int err = 0, err2;
202
203 /* This transaction may be extended/restarted along the way */
204 handle = ext3_journal_start_sb(sb, EXT3_MAX_TRANS_DATA);
205
206 if (IS_ERR(handle))
207 return PTR_ERR(handle);
208
209 mutex_lock(&sbi->s_resize_lock);
210 if (input->group != sbi->s_groups_count) {
211 err = -EBUSY;
212 goto exit_journal;
213 }
214
215 if (IS_ERR(bh = bclean(handle, sb, input->block_bitmap))) {
216 err = PTR_ERR(bh);
217 goto exit_journal;
218 }
219
220 if (ext3_bg_has_super(sb, input->group)) {
221 ext3_debug("mark backup superblock %#04lx (+0)\n", start);
222 ext3_set_bit(0, bh->b_data);
223 }
224
225 /* Copy all of the GDT blocks into the backup in this group */
226 for (i = 0, bit = 1, block = start + 1;
227 i < gdblocks; i++, block++, bit++) {
228 struct buffer_head *gdb;
229
230 ext3_debug("update backup group %#04lx (+%d)\n", block, bit);
231
232 err = extend_or_restart_transaction(handle, 1, bh);
233 if (err)
234 goto exit_bh;
235
236 gdb = sb_getblk(sb, block);
237 if (unlikely(!gdb)) {
238 err = -ENOMEM;
239 goto exit_bh;
240 }
241 if ((err = ext3_journal_get_write_access(handle, gdb))) {
242 brelse(gdb);
243 goto exit_bh;
244 }
245 lock_buffer(gdb);
246 memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size);
247 set_buffer_uptodate(gdb);
248 unlock_buffer(gdb);
249 err = ext3_journal_dirty_metadata(handle, gdb);
250 if (err) {
251 brelse(gdb);
252 goto exit_bh;
253 }
254 ext3_set_bit(bit, bh->b_data);
255 brelse(gdb);
256 }
257
258 /* Zero out all of the reserved backup group descriptor table blocks */
259 for (i = 0, bit = gdblocks + 1, block = start + bit;
260 i < reserved_gdb; i++, block++, bit++) {
261 struct buffer_head *gdb;
262
263 ext3_debug("clear reserved block %#04lx (+%d)\n", block, bit);
264
265 err = extend_or_restart_transaction(handle, 1, bh);
266 if (err)
267 goto exit_bh;
268
269 if (IS_ERR(gdb = bclean(handle, sb, block))) {
270 err = PTR_ERR(gdb);
271 goto exit_bh;
272 }
273 err = ext3_journal_dirty_metadata(handle, gdb);
274 if (err) {
275 brelse(gdb);
276 goto exit_bh;
277 }
278 ext3_set_bit(bit, bh->b_data);
279 brelse(gdb);
280 }
281 ext3_debug("mark block bitmap %#04x (+%ld)\n", input->block_bitmap,
282 input->block_bitmap - start);
283 ext3_set_bit(input->block_bitmap - start, bh->b_data);
284 ext3_debug("mark inode bitmap %#04x (+%ld)\n", input->inode_bitmap,
285 input->inode_bitmap - start);
286 ext3_set_bit(input->inode_bitmap - start, bh->b_data);
287
288 /* Zero out all of the inode table blocks */
289 for (i = 0, block = input->inode_table, bit = block - start;
290 i < sbi->s_itb_per_group; i++, bit++, block++) {
291 struct buffer_head *it;
292
293 ext3_debug("clear inode block %#04lx (+%d)\n", block, bit);
294
295 err = extend_or_restart_transaction(handle, 1, bh);
296 if (err)
297 goto exit_bh;
298
299 if (IS_ERR(it = bclean(handle, sb, block))) {
300 err = PTR_ERR(it);
301 goto exit_bh;
302 }
303 err = ext3_journal_dirty_metadata(handle, it);
304 if (err) {
305 brelse(it);
306 goto exit_bh;
307 }
308 brelse(it);
309 ext3_set_bit(bit, bh->b_data);
310 }
311
312 err = extend_or_restart_transaction(handle, 2, bh);
313 if (err)
314 goto exit_bh;
315
316 mark_bitmap_end(input->blocks_count, EXT3_BLOCKS_PER_GROUP(sb),
317 bh->b_data);
318 err = ext3_journal_dirty_metadata(handle, bh);
319 if (err)
320 goto exit_bh;
321 brelse(bh);
322
323 /* Mark unused entries in inode bitmap used */
324 ext3_debug("clear inode bitmap %#04x (+%ld)\n",
325 input->inode_bitmap, input->inode_bitmap - start);
326 if (IS_ERR(bh = bclean(handle, sb, input->inode_bitmap))) {
327 err = PTR_ERR(bh);
328 goto exit_journal;
329 }
330
331 mark_bitmap_end(EXT3_INODES_PER_GROUP(sb), EXT3_BLOCKS_PER_GROUP(sb),
332 bh->b_data);
333 err = ext3_journal_dirty_metadata(handle, bh);
334exit_bh:
335 brelse(bh);
336
337exit_journal:
338 mutex_unlock(&sbi->s_resize_lock);
339 if ((err2 = ext3_journal_stop(handle)) && !err)
340 err = err2;
341
342 return err;
343}
344
345/*
346 * Iterate through the groups which hold BACKUP superblock/GDT copies in an
347 * ext3 filesystem. The counters should be initialized to 1, 5, and 7 before
348 * calling this for the first time. In a sparse filesystem it will be the
349 * sequence of powers of 3, 5, and 7: 1, 3, 5, 7, 9, 25, 27, 49, 81, ...
350 * For a non-sparse filesystem it will be every group: 1, 2, 3, 4, ...
351 */
352static unsigned ext3_list_backups(struct super_block *sb, unsigned *three,
353 unsigned *five, unsigned *seven)
354{
355 unsigned *min = three;
356 int mult = 3;
357 unsigned ret;
358
359 if (!EXT3_HAS_RO_COMPAT_FEATURE(sb,
360 EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)) {
361 ret = *min;
362 *min += 1;
363 return ret;
364 }
365
366 if (*five < *min) {
367 min = five;
368 mult = 5;
369 }
370 if (*seven < *min) {
371 min = seven;
372 mult = 7;
373 }
374
375 ret = *min;
376 *min *= mult;
377
378 return ret;
379}
380
381/*
382 * Check that all of the backup GDT blocks are held in the primary GDT block.
383 * It is assumed that they are stored in group order. Returns the number of
384	 * groups in the current filesystem that have BACKUPS, or a -ve error code.
385 */
386static int verify_reserved_gdb(struct super_block *sb,
387 struct buffer_head *primary)
388{
389 const ext3_fsblk_t blk = primary->b_blocknr;
390 const unsigned long end = EXT3_SB(sb)->s_groups_count;
391 unsigned three = 1;
392 unsigned five = 5;
393 unsigned seven = 7;
394 unsigned grp;
395 __le32 *p = (__le32 *)primary->b_data;
396 int gdbackups = 0;
397
398 while ((grp = ext3_list_backups(sb, &three, &five, &seven)) < end) {
399 if (le32_to_cpu(*p++) != grp * EXT3_BLOCKS_PER_GROUP(sb) + blk){
400 ext3_warning(sb, __func__,
401 "reserved GDT "E3FSBLK
402 " missing grp %d ("E3FSBLK")",
403 blk, grp,
404 grp * EXT3_BLOCKS_PER_GROUP(sb) + blk);
405 return -EINVAL;
406 }
407 if (++gdbackups > EXT3_ADDR_PER_BLOCK(sb))
408 return -EFBIG;
409 }
410
411 return gdbackups;
412}
413
414/*
415 * Called when we need to bring a reserved group descriptor table block into
416 * use from the resize inode. The primary copy of the new GDT block currently
417 * is an indirect block (under the double indirect block in the resize inode).
418 * The new backup GDT blocks will be stored as leaf blocks in this indirect
419 * block, in group order. Even though we know all the block numbers we need,
420 * we check to ensure that the resize inode has actually reserved these blocks.
421 *
422 * Don't need to update the block bitmaps because the blocks are still in use.
423 *
424 * We get all of the error cases out of the way, so that we are sure to not
425 * fail once we start modifying the data on disk, because JBD has no rollback.
426 */
427static int add_new_gdb(handle_t *handle, struct inode *inode,
428 struct ext3_new_group_data *input,
429 struct buffer_head **primary)
430{
431 struct super_block *sb = inode->i_sb;
432 struct ext3_super_block *es = EXT3_SB(sb)->s_es;
433 unsigned long gdb_num = input->group / EXT3_DESC_PER_BLOCK(sb);
434 ext3_fsblk_t gdblock = EXT3_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num;
435 struct buffer_head **o_group_desc, **n_group_desc;
436 struct buffer_head *dind;
437 int gdbackups;
438 struct ext3_iloc iloc;
439 __le32 *data;
440 int err;
441
442 if (test_opt(sb, DEBUG))
443 printk(KERN_DEBUG
444 "EXT3-fs: ext3_add_new_gdb: adding group block %lu\n",
445 gdb_num);
446
447 /*
448 * If we are not using the primary superblock/GDT copy don't resize,
449 * because the user tools have no way of handling this. Probably a
450 * bad time to do it anyways.
451 */
452 if (EXT3_SB(sb)->s_sbh->b_blocknr !=
453 le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block)) {
454 ext3_warning(sb, __func__,
455 "won't resize using backup superblock at %llu",
456 (unsigned long long)EXT3_SB(sb)->s_sbh->b_blocknr);
457 return -EPERM;
458 }
459
460 *primary = sb_bread(sb, gdblock);
461 if (!*primary)
462 return -EIO;
463
464 if ((gdbackups = verify_reserved_gdb(sb, *primary)) < 0) {
465 err = gdbackups;
466 goto exit_bh;
467 }
468
469 data = EXT3_I(inode)->i_data + EXT3_DIND_BLOCK;
470 dind = sb_bread(sb, le32_to_cpu(*data));
471 if (!dind) {
472 err = -EIO;
473 goto exit_bh;
474 }
475
476 data = (__le32 *)dind->b_data;
477 if (le32_to_cpu(data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)]) != gdblock) {
478 ext3_warning(sb, __func__,
479 "new group %u GDT block "E3FSBLK" not reserved",
480 input->group, gdblock);
481 err = -EINVAL;
482 goto exit_dind;
483 }
484
485 if ((err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh)))
486 goto exit_dind;
487
488 if ((err = ext3_journal_get_write_access(handle, *primary)))
489 goto exit_sbh;
490
491 if ((err = ext3_journal_get_write_access(handle, dind)))
492 goto exit_primary;
493
494 /* ext3_reserve_inode_write() gets a reference on the iloc */
495 if ((err = ext3_reserve_inode_write(handle, inode, &iloc)))
496 goto exit_dindj;
497
498 n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *),
499 GFP_NOFS);
500 if (!n_group_desc) {
501 err = -ENOMEM;
502 ext3_warning (sb, __func__,
503 "not enough memory for %lu groups", gdb_num + 1);
504 goto exit_inode;
505 }
506
507 /*
508 * Finally, we have all of the possible failures behind us...
509 *
510 * Remove new GDT block from inode double-indirect block and clear out
511 * the new GDT block for use (which also "frees" the backup GDT blocks
512 * from the reserved inode). We don't need to change the bitmaps for
513 * these blocks, because they are marked as in-use from being in the
514 * reserved inode, and will become GDT blocks (primary and backup).
515 */
516 data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)] = 0;
517 err = ext3_journal_dirty_metadata(handle, dind);
518 if (err)
519 goto exit_group_desc;
520 brelse(dind);
521 dind = NULL;
522 inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9;
523 err = ext3_mark_iloc_dirty(handle, inode, &iloc);
524 if (err)
525 goto exit_group_desc;
526 memset((*primary)->b_data, 0, sb->s_blocksize);
527 err = ext3_journal_dirty_metadata(handle, *primary);
528 if (err)
529 goto exit_group_desc;
530
531 o_group_desc = EXT3_SB(sb)->s_group_desc;
532 memcpy(n_group_desc, o_group_desc,
533 EXT3_SB(sb)->s_gdb_count * sizeof(struct buffer_head *));
534 n_group_desc[gdb_num] = *primary;
535 EXT3_SB(sb)->s_group_desc = n_group_desc;
536 EXT3_SB(sb)->s_gdb_count++;
537 kfree(o_group_desc);
538
539 le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
540 err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
541 if (err)
542 goto exit_inode;
543
544 return 0;
545
546exit_group_desc:
547 kfree(n_group_desc);
548exit_inode:
549 //ext3_journal_release_buffer(handle, iloc.bh);
550 brelse(iloc.bh);
551exit_dindj:
552 //ext3_journal_release_buffer(handle, dind);
553exit_primary:
554 //ext3_journal_release_buffer(handle, *primary);
555exit_sbh:
556 //ext3_journal_release_buffer(handle, *primary);
557exit_dind:
558 brelse(dind);
559exit_bh:
560 brelse(*primary);
561
562 ext3_debug("leaving with error %d\n", err);
563 return err;
564}
565
566/*
567 * Called when we are adding a new group which has a backup copy of each of
568 * the GDT blocks (i.e. sparse group) and there are reserved GDT blocks.
569 * We need to add these reserved backup GDT blocks to the resize inode, so
570 * that they are kept for future resizing and not allocated to files.
571 *
572 * Each reserved backup GDT block will go into a different indirect block.
573 * The indirect blocks are actually the primary reserved GDT blocks,
574 * so we know in advance what their block numbers are. We only get the
575 * double-indirect block to verify it is pointing to the primary reserved
576 * GDT blocks so we don't overwrite a data block by accident. The reserved
577 * backup GDT blocks are stored in their reserved primary GDT block.
578 */
579static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
580 struct ext3_new_group_data *input)
581{
582 struct super_block *sb = inode->i_sb;
583 int reserved_gdb =le16_to_cpu(EXT3_SB(sb)->s_es->s_reserved_gdt_blocks);
584 struct buffer_head **primary;
585 struct buffer_head *dind;
586 struct ext3_iloc iloc;
587 ext3_fsblk_t blk;
588 __le32 *data, *end;
589 int gdbackups = 0;
590 int res, i;
591 int err;
592
593 primary = kmalloc(reserved_gdb * sizeof(*primary), GFP_NOFS);
594 if (!primary)
595 return -ENOMEM;
596
597 data = EXT3_I(inode)->i_data + EXT3_DIND_BLOCK;
598 dind = sb_bread(sb, le32_to_cpu(*data));
599 if (!dind) {
600 err = -EIO;
601 goto exit_free;
602 }
603
604 blk = EXT3_SB(sb)->s_sbh->b_blocknr + 1 + EXT3_SB(sb)->s_gdb_count;
605 data = (__le32 *)dind->b_data + (EXT3_SB(sb)->s_gdb_count %
606 EXT3_ADDR_PER_BLOCK(sb));
607 end = (__le32 *)dind->b_data + EXT3_ADDR_PER_BLOCK(sb);
608
609 /* Get each reserved primary GDT block and verify it holds backups */
610 for (res = 0; res < reserved_gdb; res++, blk++) {
611 if (le32_to_cpu(*data) != blk) {
612 ext3_warning(sb, __func__,
613 "reserved block "E3FSBLK
614 " not at offset %ld",
615 blk,
616 (long)(data - (__le32 *)dind->b_data));
617 err = -EINVAL;
618 goto exit_bh;
619 }
620 primary[res] = sb_bread(sb, blk);
621 if (!primary[res]) {
622 err = -EIO;
623 goto exit_bh;
624 }
625 if ((gdbackups = verify_reserved_gdb(sb, primary[res])) < 0) {
626 brelse(primary[res]);
627 err = gdbackups;
628 goto exit_bh;
629 }
630 if (++data >= end)
631 data = (__le32 *)dind->b_data;
632 }
633
634 for (i = 0; i < reserved_gdb; i++) {
635 if ((err = ext3_journal_get_write_access(handle, primary[i]))) {
636 /*
637 int j;
638 for (j = 0; j < i; j++)
639 ext3_journal_release_buffer(handle, primary[j]);
640 */
641 goto exit_bh;
642 }
643 }
644
645 if ((err = ext3_reserve_inode_write(handle, inode, &iloc)))
646 goto exit_bh;
647
648 /*
649 * Finally we can add each of the reserved backup GDT blocks from
650 * the new group to its reserved primary GDT block.
651 */
652 blk = input->group * EXT3_BLOCKS_PER_GROUP(sb);
653 for (i = 0; i < reserved_gdb; i++) {
654 int err2;
655 data = (__le32 *)primary[i]->b_data;
656 /* printk("reserving backup %lu[%u] = %lu\n",
657 primary[i]->b_blocknr, gdbackups,
658 blk + primary[i]->b_blocknr); */
659 data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr);
660 err2 = ext3_journal_dirty_metadata(handle, primary[i]);
661 if (!err)
662 err = err2;
663 }
664 inode->i_blocks += reserved_gdb * sb->s_blocksize >> 9;
665 ext3_mark_iloc_dirty(handle, inode, &iloc);
666
667exit_bh:
668 while (--res >= 0)
669 brelse(primary[res]);
670 brelse(dind);
671
672exit_free:
673 kfree(primary);
674
675 return err;
676}
677
678/*
679 * Update the backup copies of the ext3 metadata. These don't need to be part
680 * of the main resize transaction, because e2fsck will re-write them if there
681 * is a problem (basically only OOM will cause a problem). However, we
682 * _should_ update the backups if possible, in case the primary gets trashed
683 * for some reason and we need to run e2fsck from a backup superblock. The
684 * important part is that the new block and inode counts are in the backup
685 * superblocks, and the location of the new group metadata in the GDT backups.
686 *
687	 * We do not need to take the s_resize_lock for this, because these
688 * blocks are not otherwise touched by the filesystem code when it is
689	 * mounted. We don't need to worry about 'last' going stale relative to
690	 * sbi->s_groups_count, because the worst that can happen is that we
691 * do not copy the full number of backups at this time. The resize
692 * which changed s_groups_count will backup again.
693 */
694static void update_backups(struct super_block *sb,
695 int blk_off, char *data, int size)
696{
697 struct ext3_sb_info *sbi = EXT3_SB(sb);
698 const unsigned long last = sbi->s_groups_count;
699 const int bpg = EXT3_BLOCKS_PER_GROUP(sb);
700 unsigned three = 1;
701 unsigned five = 5;
702 unsigned seven = 7;
703 unsigned group;
704 int rest = sb->s_blocksize - size;
705 handle_t *handle;
706 int err = 0, err2;
707
708 handle = ext3_journal_start_sb(sb, EXT3_MAX_TRANS_DATA);
709 if (IS_ERR(handle)) {
710 group = 1;
711 err = PTR_ERR(handle);
712 goto exit_err;
713 }
714
715 while ((group = ext3_list_backups(sb, &three, &five, &seven)) < last) {
716 struct buffer_head *bh;
717
718 /* Out of journal space, and can't get more - abort - so sad */
719 if (handle->h_buffer_credits == 0 &&
720 ext3_journal_extend(handle, EXT3_MAX_TRANS_DATA) &&
721 (err = ext3_journal_restart(handle, EXT3_MAX_TRANS_DATA)))
722 break;
723
724 bh = sb_getblk(sb, group * bpg + blk_off);
725 if (unlikely(!bh)) {
726 err = -ENOMEM;
727 break;
728 }
729 ext3_debug("update metadata backup %#04lx\n",
730 (unsigned long)bh->b_blocknr);
731 if ((err = ext3_journal_get_write_access(handle, bh))) {
732 brelse(bh);
733 break;
734 }
735 lock_buffer(bh);
736 memcpy(bh->b_data, data, size);
737 if (rest)
738 memset(bh->b_data + size, 0, rest);
739 set_buffer_uptodate(bh);
740 unlock_buffer(bh);
741 err = ext3_journal_dirty_metadata(handle, bh);
742 brelse(bh);
743 if (err)
744 break;
745 }
746 if ((err2 = ext3_journal_stop(handle)) && !err)
747 err = err2;
748
749 /*
750 * Ugh! Need to have e2fsck write the backup copies. It is too
751 * late to revert the resize, we shouldn't fail just because of
752 * the backup copies (they are only needed in case of corruption).
753 *
754 * However, if we got here we have a journal problem too, so we
755 * can't really start a transaction to mark the superblock.
756 * Chicken out and just set the flag on the hope it will be written
757 * to disk, and if not - we will simply wait until next fsck.
758 */
759exit_err:
760 if (err) {
761 ext3_warning(sb, __func__,
762 "can't update backup for group %d (err %d), "
763 "forcing fsck on next reboot", group, err);
764 sbi->s_mount_state &= ~EXT3_VALID_FS;
765 sbi->s_es->s_state &= cpu_to_le16(~EXT3_VALID_FS);
766 mark_buffer_dirty(sbi->s_sbh);
767 }
768}
769
770/* Add group descriptor data to an existing or new group descriptor block.
771 * Ensure we handle all possible error conditions _before_ we start modifying
772 * the filesystem, because we cannot abort the transaction and not have it
773 * write the data to disk.
774 *
775 * If we are on a GDT block boundary, we need to get the reserved GDT block.
776 * Otherwise, we may need to add backup GDT blocks for a sparse group.
777 *
778 * We only need to hold the superblock lock while we are actually adding
779 * in the new group's counts to the superblock. Prior to that we have
780 * not really "added" the group at all. We re-check that we are still
781 * adding in the last group in case things have changed since verifying.
782 */
783int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
784{
785 struct ext3_sb_info *sbi = EXT3_SB(sb);
786 struct ext3_super_block *es = sbi->s_es;
787 int reserved_gdb = ext3_bg_has_super(sb, input->group) ?
788 le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
789 struct buffer_head *primary = NULL;
790 struct ext3_group_desc *gdp;
791 struct inode *inode = NULL;
792 handle_t *handle;
793 int gdb_off, gdb_num;
794 int err, err2;
795
796 gdb_num = input->group / EXT3_DESC_PER_BLOCK(sb);
797 gdb_off = input->group % EXT3_DESC_PER_BLOCK(sb);
798
799 if (gdb_off == 0 && !EXT3_HAS_RO_COMPAT_FEATURE(sb,
800 EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)) {
801 ext3_warning(sb, __func__,
802 "Can't resize non-sparse filesystem further");
803 return -EPERM;
804 }
805
806 if (le32_to_cpu(es->s_blocks_count) + input->blocks_count <
807 le32_to_cpu(es->s_blocks_count)) {
808 ext3_warning(sb, __func__, "blocks_count overflow\n");
809 return -EINVAL;
810 }
811
812 if (le32_to_cpu(es->s_inodes_count) + EXT3_INODES_PER_GROUP(sb) <
813 le32_to_cpu(es->s_inodes_count)) {
814 ext3_warning(sb, __func__, "inodes_count overflow\n");
815 return -EINVAL;
816 }
817
818 if (reserved_gdb || gdb_off == 0) {
819 if (!EXT3_HAS_COMPAT_FEATURE(sb,
820 EXT3_FEATURE_COMPAT_RESIZE_INODE)
821 || !le16_to_cpu(es->s_reserved_gdt_blocks)) {
822 ext3_warning(sb, __func__,
823 "No reserved GDT blocks, can't resize");
824 return -EPERM;
825 }
826 inode = ext3_iget(sb, EXT3_RESIZE_INO);
827 if (IS_ERR(inode)) {
828 ext3_warning(sb, __func__,
829 "Error opening resize inode");
830 return PTR_ERR(inode);
831 }
832 }
833
834 if ((err = verify_group_input(sb, input)))
835 goto exit_put;
836
837 if ((err = setup_new_group_blocks(sb, input)))
838 goto exit_put;
839
840 /*
841 * We will always be modifying at least the superblock and a GDT
842 * block. If we are adding a group past the last current GDT block,
843 * we will also modify the inode and the dindirect block. If we
844 * are adding a group with superblock/GDT backups we will also
845 * modify each of the reserved GDT dindirect blocks.
846 */
847 handle = ext3_journal_start_sb(sb,
848 ext3_bg_has_super(sb, input->group) ?
849 3 + reserved_gdb : 4);
850 if (IS_ERR(handle)) {
851 err = PTR_ERR(handle);
852 goto exit_put;
853 }
854
855 mutex_lock(&sbi->s_resize_lock);
856 if (input->group != sbi->s_groups_count) {
857 ext3_warning(sb, __func__,
858 "multiple resizers run on filesystem!");
859 err = -EBUSY;
860 goto exit_journal;
861 }
862
863 if ((err = ext3_journal_get_write_access(handle, sbi->s_sbh)))
864 goto exit_journal;
865
866 /*
867 * We will only either add reserved group blocks to a backup group
868 * or remove reserved blocks for the first group in a new group block.
869 * Doing both would mean more complex code, and sane people don't
870 * use non-sparse filesystems anymore. This is already checked above.
871 */
872 if (gdb_off) {
873 primary = sbi->s_group_desc[gdb_num];
874 if ((err = ext3_journal_get_write_access(handle, primary)))
875 goto exit_journal;
876
877 if (reserved_gdb && ext3_bg_num_gdb(sb, input->group) &&
878 (err = reserve_backup_gdb(handle, inode, input)))
879 goto exit_journal;
880 } else if ((err = add_new_gdb(handle, inode, input, &primary)))
881 goto exit_journal;
882
883 /*
884 * OK, now we've set up the new group. Time to make it active.
885 *
886 * We do not lock all allocations via s_resize_lock
887 * so we have to be safe wrt. concurrent accesses to the group
888 * data. So we need to be careful to set all of the relevant
889 * group descriptor data etc. *before* we enable the group.
890 *
891 * The key field here is sbi->s_groups_count: as long as
892 * that retains its old value, nobody is going to access the new
893 * group.
894 *
895 * So first we update all the descriptor metadata for the new
896 * group; then we update the total disk blocks count; then we
897 * update the groups count to enable the group; then finally we
898 * update the free space counts so that the system can start
899 * using the new disk blocks.
900 */
901
902 /* Update group descriptor block for new group */
903 gdp = (struct ext3_group_desc *)primary->b_data + gdb_off;
904
905 gdp->bg_block_bitmap = cpu_to_le32(input->block_bitmap);
906 gdp->bg_inode_bitmap = cpu_to_le32(input->inode_bitmap);
907 gdp->bg_inode_table = cpu_to_le32(input->inode_table);
908 gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count);
909 gdp->bg_free_inodes_count = cpu_to_le16(EXT3_INODES_PER_GROUP(sb));
910
911 /*
912 * Make the new blocks and inodes valid next. We do this before
913 * increasing the group count so that once the group is enabled,
914 * all of its blocks and inodes are already valid.
915 *
916 * We always allocate group-by-group, then block-by-block or
917 * inode-by-inode within a group, so enabling these
918 * blocks/inodes before the group is live won't actually let us
919 * allocate the new space yet.
920 */
921 le32_add_cpu(&es->s_blocks_count, input->blocks_count);
922 le32_add_cpu(&es->s_inodes_count, EXT3_INODES_PER_GROUP(sb));
923
924 /*
925 * We need to protect s_groups_count against other CPUs seeing
926 * inconsistent state in the superblock.
927 *
928 * The precise rules we use are:
929 *
930 * * Writers of s_groups_count *must* hold s_resize_lock
931 * AND
932 * * Writers must perform a smp_wmb() after updating all dependent
933 * data and before modifying the groups count
934 *
935 * * Readers must hold s_resize_lock over the access
936 * OR
937 * * Readers must perform an smp_rmb() after reading the groups count
938 * and before reading any dependent data.
939 *
940 * NB. These rules can be relaxed when checking the group count
941 * while freeing data, as we can only allocate from a block
942 * group after serialising against the group count, and we can
943 * only then free after serialising in turn against that
944 * allocation.
945 */
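/*
 * Editorial sketch, not part of the original file: a lock-free reader
 * pairs with the smp_wmb() below roughly as
 *
 *	ngroups = EXT3_SB(sb)->s_groups_count;
 *	smp_rmb();
 *	gdp = ext3_get_group_desc(sb, group, NULL);	(with group < ngroups)
 *
 * i.e. the descriptor, bitmaps and superblock counts written above are
 * guaranteed visible before any group below ngroups is used.
 */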
946 smp_wmb();
947
948 /* Update the global fs size fields */
949 sbi->s_groups_count++;
950
951 err = ext3_journal_dirty_metadata(handle, primary);
952 if (err)
953 goto exit_journal;
954
955 /* Update the reserved block counts only once the new group is
956 * active. */
957 le32_add_cpu(&es->s_r_blocks_count, input->reserved_blocks);
958
959 /* Update the free space counts */
960 percpu_counter_add(&sbi->s_freeblocks_counter,
961 input->free_blocks_count);
962 percpu_counter_add(&sbi->s_freeinodes_counter,
963 EXT3_INODES_PER_GROUP(sb));
964
965 err = ext3_journal_dirty_metadata(handle, sbi->s_sbh);
966
967exit_journal:
968 mutex_unlock(&sbi->s_resize_lock);
969 if ((err2 = ext3_journal_stop(handle)) && !err)
970 err = err2;
971 if (!err) {
972 update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es,
973 sizeof(struct ext3_super_block));
974 update_backups(sb, primary->b_blocknr, primary->b_data,
975 primary->b_size);
976 }
977exit_put:
978 iput(inode);
979 return err;
980} /* ext3_group_add */
981
982/* Extend the filesystem to the new number of blocks specified. This entry
983 * point is only used to extend the current filesystem to the end of the last
984 * existing group. It can be accessed via ioctl, or by "remount,resize=<size>"
985 * for emergencies (because it has no dependencies on reserved blocks).
986 *
987 * If we _really_ wanted, we could use default values to call ext3_group_add()
988 * to allow the "remount" trick to work for arbitrary resizing, assuming enough
989 * GDT blocks are reserved to grow to the desired size.
990 */
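/*
 * Worked example (editorial, not in the original source): with 32768 blocks
 * per group and a last group that already holds 31768 blocks, last = 31768
 * and add = 1000 below.  Growing by only 400 blocks clamps add to 400 and
 * just enlarges the last group; asking for 50000 leaves add at 1000, adds
 * those 1000 blocks, and prints the "will only finish group" warning.
 */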
991int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
992 ext3_fsblk_t n_blocks_count)
993{
994 ext3_fsblk_t o_blocks_count;
995 ext3_grpblk_t last;
996 ext3_grpblk_t add;
997 struct buffer_head * bh;
998 handle_t *handle;
999 int err;
1000 unsigned long freed_blocks;
1001
1002 /* We don't need to worry about locking wrt other resizers just
1003 * yet: we're going to revalidate es->s_blocks_count after
1004 * taking the s_resize_lock below. */
1005 o_blocks_count = le32_to_cpu(es->s_blocks_count);
1006
1007 if (test_opt(sb, DEBUG))
1008 printk(KERN_DEBUG "EXT3-fs: extending last group from "E3FSBLK
1009 " up to "E3FSBLK" blocks\n",
1010 o_blocks_count, n_blocks_count);
1011
1012 if (n_blocks_count == 0 || n_blocks_count == o_blocks_count)
1013 return 0;
1014
1015 if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
1016 printk(KERN_ERR "EXT3-fs: filesystem on %s:"
1017 " too large to resize to "E3FSBLK" blocks safely\n",
1018 sb->s_id, n_blocks_count);
1019 if (sizeof(sector_t) < 8)
1020 ext3_warning(sb, __func__,
1021 "CONFIG_LBDAF not enabled\n");
1022 return -EINVAL;
1023 }
1024
1025 if (n_blocks_count < o_blocks_count) {
1026 ext3_warning(sb, __func__,
1027 "can't shrink FS - resize aborted");
1028 return -EBUSY;
1029 }
1030
1031 /* Handle the remaining blocks in the last group only. */
1032 last = (o_blocks_count - le32_to_cpu(es->s_first_data_block)) %
1033 EXT3_BLOCKS_PER_GROUP(sb);
1034
1035 if (last == 0) {
1036 ext3_warning(sb, __func__,
1037 "need to use ext2online to resize further");
1038 return -EPERM;
1039 }
1040
1041 add = EXT3_BLOCKS_PER_GROUP(sb) - last;
1042
1043 if (o_blocks_count + add < o_blocks_count) {
1044 ext3_warning(sb, __func__, "blocks_count overflow");
1045 return -EINVAL;
1046 }
1047
1048 if (o_blocks_count + add > n_blocks_count)
1049 add = n_blocks_count - o_blocks_count;
1050
1051 if (o_blocks_count + add < n_blocks_count)
1052 ext3_warning(sb, __func__,
1053 "will only finish group ("E3FSBLK
1054 " blocks, %u new)",
1055 o_blocks_count + add, add);
1056
1057 /* See if the device is actually as big as what was requested */
1058 bh = sb_bread(sb, o_blocks_count + add - 1);
1059 if (!bh) {
1060 ext3_warning(sb, __func__,
1061 "can't read last block, resize aborted");
1062 return -ENOSPC;
1063 }
1064 brelse(bh);
1065
1066 /* We will update the superblock, one block bitmap, and
1067 * one group descriptor via ext3_free_blocks().
1068 */
1069 handle = ext3_journal_start_sb(sb, 3);
1070 if (IS_ERR(handle)) {
1071 err = PTR_ERR(handle);
1072 ext3_warning(sb, __func__, "error %d on journal start", err);
1073 goto exit_put;
1074 }
1075
1076 mutex_lock(&EXT3_SB(sb)->s_resize_lock);
1077 if (o_blocks_count != le32_to_cpu(es->s_blocks_count)) {
1078 ext3_warning(sb, __func__,
1079 "multiple resizers run on filesystem!");
1080 mutex_unlock(&EXT3_SB(sb)->s_resize_lock);
1081 ext3_journal_stop(handle);
1082 err = -EBUSY;
1083 goto exit_put;
1084 }
1085
1086 if ((err = ext3_journal_get_write_access(handle,
1087 EXT3_SB(sb)->s_sbh))) {
1088 ext3_warning(sb, __func__,
1089 "error %d on journal write access", err);
1090 mutex_unlock(&EXT3_SB(sb)->s_resize_lock);
1091 ext3_journal_stop(handle);
1092 goto exit_put;
1093 }
1094 es->s_blocks_count = cpu_to_le32(o_blocks_count + add);
1095 err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
1096 mutex_unlock(&EXT3_SB(sb)->s_resize_lock);
1097 if (err) {
1098 ext3_warning(sb, __func__,
1099 "error %d on journal dirty metadata", err);
1100 ext3_journal_stop(handle);
1101 goto exit_put;
1102 }
1103 ext3_debug("freeing blocks "E3FSBLK" through "E3FSBLK"\n",
1104 o_blocks_count, o_blocks_count + add);
1105 ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
1106 ext3_debug("freed blocks "E3FSBLK" through "E3FSBLK"\n",
1107 o_blocks_count, o_blocks_count + add);
1108 if ((err = ext3_journal_stop(handle)))
1109 goto exit_put;
1110 if (test_opt(sb, DEBUG))
1111 printk(KERN_DEBUG "EXT3-fs: extended group to %u blocks\n",
1112 le32_to_cpu(es->s_blocks_count));
1113 update_backups(sb, EXT3_SB(sb)->s_sbh->b_blocknr, (char *)es,
1114 sizeof(struct ext3_super_block));
1115exit_put:
1116 return err;
1117} /* ext3_group_extend */
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
deleted file mode 100644
index 5ed0044fbb37..000000000000
--- a/fs/ext3/super.c
+++ /dev/null
@@ -1,3165 +0,0 @@
1/*
2 * linux/fs/ext3/super.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * from
10 *
11 * linux/fs/minix/inode.c
12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds
14 *
15 * Big-endian to little-endian byte-swapping/bitmaps by
16 * David S. Miller (davem@caip.rutgers.edu), 1995
17 */
18
19#include <linux/module.h>
20#include <linux/blkdev.h>
21#include <linux/parser.h>
22#include <linux/exportfs.h>
23#include <linux/statfs.h>
24#include <linux/random.h>
25#include <linux/mount.h>
26#include <linux/quotaops.h>
27#include <linux/seq_file.h>
28#include <linux/log2.h>
29#include <linux/cleancache.h>
30#include <linux/namei.h>
31
32#include <asm/uaccess.h>
33
34#define CREATE_TRACE_POINTS
35
36#include "ext3.h"
37#include "xattr.h"
38#include "acl.h"
39#include "namei.h"
40
41#ifdef CONFIG_EXT3_DEFAULTS_TO_ORDERED
42 #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_ORDERED_DATA
43#else
44 #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_WRITEBACK_DATA
45#endif
46
47static int ext3_load_journal(struct super_block *, struct ext3_super_block *,
48 unsigned long journal_devnum);
49static int ext3_create_journal(struct super_block *, struct ext3_super_block *,
50 unsigned int);
51static int ext3_commit_super(struct super_block *sb,
52 struct ext3_super_block *es,
53 int sync);
54static void ext3_mark_recovery_complete(struct super_block * sb,
55 struct ext3_super_block * es);
56static void ext3_clear_journal_err(struct super_block * sb,
57 struct ext3_super_block * es);
58static int ext3_sync_fs(struct super_block *sb, int wait);
59static const char *ext3_decode_error(struct super_block * sb, int errno,
60 char nbuf[16]);
61static int ext3_remount (struct super_block * sb, int * flags, char * data);
62static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf);
63static int ext3_unfreeze(struct super_block *sb);
64static int ext3_freeze(struct super_block *sb);
65
66/*
67 * Wrappers for journal_start/end.
68 */
69handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks)
70{
71 journal_t *journal;
72
73 if (sb->s_flags & MS_RDONLY)
74 return ERR_PTR(-EROFS);
75
76 /* Special case here: if the journal has aborted behind our
77 * backs (eg. EIO in the commit thread), then we still need to
78 * take the FS itself readonly cleanly. */
79 journal = EXT3_SB(sb)->s_journal;
80 if (is_journal_aborted(journal)) {
81 ext3_abort(sb, __func__,
82 "Detected aborted journal");
83 return ERR_PTR(-EROFS);
84 }
85
86 return journal_start(journal, nblocks);
87}
88
89int __ext3_journal_stop(const char *where, handle_t *handle)
90{
91 struct super_block *sb;
92 int err;
93 int rc;
94
95 sb = handle->h_transaction->t_journal->j_private;
96 err = handle->h_err;
97 rc = journal_stop(handle);
98
99 if (!err)
100 err = rc;
101 if (err)
102 __ext3_std_error(sb, where, err);
103 return err;
104}
105
106void ext3_journal_abort_handle(const char *caller, const char *err_fn,
107 struct buffer_head *bh, handle_t *handle, int err)
108{
109 char nbuf[16];
110 const char *errstr = ext3_decode_error(NULL, err, nbuf);
111
112 if (bh)
113 BUFFER_TRACE(bh, "abort");
114
115 if (!handle->h_err)
116 handle->h_err = err;
117
118 if (is_handle_aborted(handle))
119 return;
120
121 printk(KERN_ERR "EXT3-fs: %s: aborting transaction: %s in %s\n",
122 caller, errstr, err_fn);
123
124 journal_abort_handle(handle);
125}
126
127void ext3_msg(struct super_block *sb, const char *prefix,
128 const char *fmt, ...)
129{
130 struct va_format vaf;
131 va_list args;
132
133 va_start(args, fmt);
134
135 vaf.fmt = fmt;
136 vaf.va = &args;
137
138 printk("%sEXT3-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
139
140 va_end(args);
141}
142
143/* Deal with the reporting of failure conditions on a filesystem such as
144 * inconsistencies detected or read IO failures.
145 *
146 * On ext2, we can store the error state of the filesystem in the
147 * superblock. That is not possible on ext3, because we may have other
148 * write ordering constraints on the superblock which prevent us from
149 * writing it out straight away; and given that the journal is about to
150 * be aborted, we can't rely on the current, or future, transactions to
151 * write out the superblock safely.
152 *
153 * We'll just use the journal_abort() error code to record an error in
154 * the journal instead. On recovery, the journal will complain about
155 * that error until we've noted it down and cleared it.
156 */
157
158static void ext3_handle_error(struct super_block *sb)
159{
160 struct ext3_super_block *es = EXT3_SB(sb)->s_es;
161
162 EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
163 es->s_state |= cpu_to_le16(EXT3_ERROR_FS);
164
165 if (sb->s_flags & MS_RDONLY)
166 return;
167
168 if (!test_opt (sb, ERRORS_CONT)) {
169 journal_t *journal = EXT3_SB(sb)->s_journal;
170
171 set_opt(EXT3_SB(sb)->s_mount_opt, ABORT);
172 if (journal)
173 journal_abort(journal, -EIO);
174 }
175 if (test_opt (sb, ERRORS_RO)) {
176 ext3_msg(sb, KERN_CRIT,
177 "error: remounting filesystem read-only");
178 /*
179 * Make sure updated value of ->s_mount_state will be visible
180 * before ->s_flags update.
181 */
182 smp_wmb();
183 sb->s_flags |= MS_RDONLY;
184 }
185 ext3_commit_super(sb, es, 1);
186 if (test_opt(sb, ERRORS_PANIC))
187 panic("EXT3-fs (%s): panic forced after error\n",
188 sb->s_id);
189}
190
191void ext3_error(struct super_block *sb, const char *function,
192 const char *fmt, ...)
193{
194 struct va_format vaf;
195 va_list args;
196
197 va_start(args, fmt);
198
199 vaf.fmt = fmt;
200 vaf.va = &args;
201
202 printk(KERN_CRIT "EXT3-fs error (device %s): %s: %pV\n",
203 sb->s_id, function, &vaf);
204
205 va_end(args);
206
207 ext3_handle_error(sb);
208}
209
210static const char *ext3_decode_error(struct super_block * sb, int errno,
211 char nbuf[16])
212{
213 char *errstr = NULL;
214
215 switch (errno) {
216 case -EIO:
217 errstr = "IO failure";
218 break;
219 case -ENOMEM:
220 errstr = "Out of memory";
221 break;
222 case -EROFS:
223 if (!sb || EXT3_SB(sb)->s_journal->j_flags & JFS_ABORT)
224 errstr = "Journal has aborted";
225 else
226 errstr = "Readonly filesystem";
227 break;
228 default:
229 /* If the caller passed in an extra buffer for unknown
230 * errors, textualise them now. Else we just return
231 * NULL. */
232 if (nbuf) {
233 /* Check for truncated error codes... */
234 if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
235 errstr = nbuf;
236 }
237 break;
238 }
239
240 return errstr;
241}
242
243/* __ext3_std_error decodes expected errors from journaling functions
244 * automatically and invokes the appropriate error response. */
245
246void __ext3_std_error (struct super_block * sb, const char * function,
247 int errno)
248{
249 char nbuf[16];
250 const char *errstr;
251
252 /* Special case: if the error is EROFS, and we're not already
253 * inside a transaction, then there's really no point in logging
254 * an error. */
255 if (errno == -EROFS && journal_current_handle() == NULL &&
256 (sb->s_flags & MS_RDONLY))
257 return;
258
259 errstr = ext3_decode_error(sb, errno, nbuf);
260 ext3_msg(sb, KERN_CRIT, "error in %s: %s", function, errstr);
261
262 ext3_handle_error(sb);
263}
264
265/*
266 * ext3_abort is a much stronger failure handler than ext3_error. The
267 * abort function may be used to deal with unrecoverable failures such
268 * as journal IO errors or ENOMEM at a critical moment in log management.
269 *
270 * We unconditionally force the filesystem into an ABORT|READONLY state,
271 * unless the error response on the fs has been set to panic in which
272 * case we take the easy way out and panic immediately.
273 */
274
275void ext3_abort(struct super_block *sb, const char *function,
276 const char *fmt, ...)
277{
278 struct va_format vaf;
279 va_list args;
280
281 va_start(args, fmt);
282
283 vaf.fmt = fmt;
284 vaf.va = &args;
285
286 printk(KERN_CRIT "EXT3-fs (%s): error: %s: %pV\n",
287 sb->s_id, function, &vaf);
288
289 va_end(args);
290
291 if (test_opt(sb, ERRORS_PANIC))
292 panic("EXT3-fs: panic from previous error\n");
293
294 if (sb->s_flags & MS_RDONLY)
295 return;
296
297 ext3_msg(sb, KERN_CRIT,
298 "error: remounting filesystem read-only");
299 EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
300 set_opt(EXT3_SB(sb)->s_mount_opt, ABORT);
301 /*
302 * Make sure updated value of ->s_mount_state will be visible
303 * before ->s_flags update.
304 */
305 smp_wmb();
306 sb->s_flags |= MS_RDONLY;
307
308 if (EXT3_SB(sb)->s_journal)
309 journal_abort(EXT3_SB(sb)->s_journal, -EIO);
310}
311
312void ext3_warning(struct super_block *sb, const char *function,
313 const char *fmt, ...)
314{
315 struct va_format vaf;
316 va_list args;
317
318 va_start(args, fmt);
319
320 vaf.fmt = fmt;
321 vaf.va = &args;
322
323 printk(KERN_WARNING "EXT3-fs (%s): warning: %s: %pV\n",
324 sb->s_id, function, &vaf);
325
326 va_end(args);
327}
328
329void ext3_update_dynamic_rev(struct super_block *sb)
330{
331 struct ext3_super_block *es = EXT3_SB(sb)->s_es;
332
333 if (le32_to_cpu(es->s_rev_level) > EXT3_GOOD_OLD_REV)
334 return;
335
336 ext3_msg(sb, KERN_WARNING,
337 "warning: updating to rev %d because of "
338 "new feature flag, running e2fsck is recommended",
339 EXT3_DYNAMIC_REV);
340
341 es->s_first_ino = cpu_to_le32(EXT3_GOOD_OLD_FIRST_INO);
342 es->s_inode_size = cpu_to_le16(EXT3_GOOD_OLD_INODE_SIZE);
343 es->s_rev_level = cpu_to_le32(EXT3_DYNAMIC_REV);
344 /* leave es->s_feature_*compat flags alone */
345 /* es->s_uuid will be set by e2fsck if empty */
346
347 /*
348 * The rest of the superblock fields should be zero, and if not it
349 * means they are likely already in use, so leave them alone. We
350 * can leave it up to e2fsck to clean up any inconsistencies there.
351 */
352}
353
354/*
355 * Open the external journal device
356 */
357static struct block_device *ext3_blkdev_get(dev_t dev, struct super_block *sb)
358{
359 struct block_device *bdev;
360 char b[BDEVNAME_SIZE];
361
362 bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
363 if (IS_ERR(bdev))
364 goto fail;
365 return bdev;
366
367fail:
368 ext3_msg(sb, KERN_ERR, "error: failed to open journal device %s: %ld",
369 __bdevname(dev, b), PTR_ERR(bdev));
370
371 return NULL;
372}
373
374/*
375 * Release the journal device
376 */
377static void ext3_blkdev_put(struct block_device *bdev)
378{
379 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
380}
381
382static void ext3_blkdev_remove(struct ext3_sb_info *sbi)
383{
384 struct block_device *bdev;
385 bdev = sbi->journal_bdev;
386 if (bdev) {
387 ext3_blkdev_put(bdev);
388 sbi->journal_bdev = NULL;
389 }
390}
391
392static inline struct inode *orphan_list_entry(struct list_head *l)
393{
394 return &list_entry(l, struct ext3_inode_info, i_orphan)->vfs_inode;
395}
396
397static void dump_orphan_list(struct super_block *sb, struct ext3_sb_info *sbi)
398{
399 struct list_head *l;
400
401 ext3_msg(sb, KERN_ERR, "error: sb orphan head is %d",
402 le32_to_cpu(sbi->s_es->s_last_orphan));
403
404 ext3_msg(sb, KERN_ERR, "sb_info orphan list:");
405 list_for_each(l, &sbi->s_orphan) {
406 struct inode *inode = orphan_list_entry(l);
407 ext3_msg(sb, KERN_ERR, " "
408 "inode %s:%lu at %p: mode %o, nlink %d, next %d\n",
409 inode->i_sb->s_id, inode->i_ino, inode,
410 inode->i_mode, inode->i_nlink,
411 NEXT_ORPHAN(inode));
412 }
413}
414
415static void ext3_put_super (struct super_block * sb)
416{
417 struct ext3_sb_info *sbi = EXT3_SB(sb);
418 struct ext3_super_block *es = sbi->s_es;
419 int i, err;
420
421 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
422 ext3_xattr_put_super(sb);
423 err = journal_destroy(sbi->s_journal);
424 sbi->s_journal = NULL;
425 if (err < 0)
426 ext3_abort(sb, __func__, "Couldn't clean up the journal");
427
428 if (!(sb->s_flags & MS_RDONLY)) {
429 EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
430 es->s_state = cpu_to_le16(sbi->s_mount_state);
431 BUFFER_TRACE(sbi->s_sbh, "marking dirty");
432 mark_buffer_dirty(sbi->s_sbh);
433 ext3_commit_super(sb, es, 1);
434 }
435
436 for (i = 0; i < sbi->s_gdb_count; i++)
437 brelse(sbi->s_group_desc[i]);
438 kfree(sbi->s_group_desc);
439 percpu_counter_destroy(&sbi->s_freeblocks_counter);
440 percpu_counter_destroy(&sbi->s_freeinodes_counter);
441 percpu_counter_destroy(&sbi->s_dirs_counter);
442 brelse(sbi->s_sbh);
443#ifdef CONFIG_QUOTA
444 for (i = 0; i < EXT3_MAXQUOTAS; i++)
445 kfree(sbi->s_qf_names[i]);
446#endif
447
448 /* Debugging code just in case the in-memory inode orphan list
449 * isn't empty. The on-disk one can be non-empty if we've
450 * detected an error and taken the fs readonly, but the
451 * in-memory list had better be clean by this point. */
452 if (!list_empty(&sbi->s_orphan))
453 dump_orphan_list(sb, sbi);
454 J_ASSERT(list_empty(&sbi->s_orphan));
455
456 invalidate_bdev(sb->s_bdev);
457 if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) {
458 /*
459 * Invalidate the journal device's buffers. We don't want them
460 * floating about in memory - the physical journal device may
461 * be hotswapped, and it breaks the `ro-after' testing code.
462 */
463 sync_blockdev(sbi->journal_bdev);
464 invalidate_bdev(sbi->journal_bdev);
465 ext3_blkdev_remove(sbi);
466 }
467 sb->s_fs_info = NULL;
468 kfree(sbi->s_blockgroup_lock);
469 mutex_destroy(&sbi->s_orphan_lock);
470 mutex_destroy(&sbi->s_resize_lock);
471 kfree(sbi);
472}
473
474static struct kmem_cache *ext3_inode_cachep;
475
476/*
477 * Called inside transaction, so use GFP_NOFS
478 */
479static struct inode *ext3_alloc_inode(struct super_block *sb)
480{
481 struct ext3_inode_info *ei;
482
483 ei = kmem_cache_alloc(ext3_inode_cachep, GFP_NOFS);
484 if (!ei)
485 return NULL;
486 ei->i_block_alloc_info = NULL;
487 ei->vfs_inode.i_version = 1;
488 atomic_set(&ei->i_datasync_tid, 0);
489 atomic_set(&ei->i_sync_tid, 0);
490#ifdef CONFIG_QUOTA
491 memset(&ei->i_dquot, 0, sizeof(ei->i_dquot));
492#endif
493
494 return &ei->vfs_inode;
495}
496
497static int ext3_drop_inode(struct inode *inode)
498{
499 int drop = generic_drop_inode(inode);
500
501 trace_ext3_drop_inode(inode, drop);
502 return drop;
503}
504
505static void ext3_i_callback(struct rcu_head *head)
506{
507 struct inode *inode = container_of(head, struct inode, i_rcu);
508 kmem_cache_free(ext3_inode_cachep, EXT3_I(inode));
509}
510
511static void ext3_destroy_inode(struct inode *inode)
512{
513 if (!list_empty(&(EXT3_I(inode)->i_orphan))) {
514 printk("EXT3 Inode %p: orphan list check failed!\n",
515 EXT3_I(inode));
516 print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4,
517 EXT3_I(inode), sizeof(struct ext3_inode_info),
518 false);
519 dump_stack();
520 }
521 call_rcu(&inode->i_rcu, ext3_i_callback);
522}
523
524static void init_once(void *foo)
525{
526 struct ext3_inode_info *ei = (struct ext3_inode_info *) foo;
527
528 INIT_LIST_HEAD(&ei->i_orphan);
529#ifdef CONFIG_EXT3_FS_XATTR
530 init_rwsem(&ei->xattr_sem);
531#endif
532 mutex_init(&ei->truncate_mutex);
533 inode_init_once(&ei->vfs_inode);
534}
535
536static int __init init_inodecache(void)
537{
538 ext3_inode_cachep = kmem_cache_create("ext3_inode_cache",
539 sizeof(struct ext3_inode_info),
540 0, (SLAB_RECLAIM_ACCOUNT|
541 SLAB_MEM_SPREAD),
542 init_once);
543 if (ext3_inode_cachep == NULL)
544 return -ENOMEM;
545 return 0;
546}
547
548static void destroy_inodecache(void)
549{
550 /*
551 * Make sure all delayed rcu free inodes are flushed before we
552 * destroy cache.
553 */
554 rcu_barrier();
555 kmem_cache_destroy(ext3_inode_cachep);
556}
557
558static inline void ext3_show_quota_options(struct seq_file *seq, struct super_block *sb)
559{
560#if defined(CONFIG_QUOTA)
561 struct ext3_sb_info *sbi = EXT3_SB(sb);
562
563 if (sbi->s_jquota_fmt) {
564 char *fmtname = "";
565
566 switch (sbi->s_jquota_fmt) {
567 case QFMT_VFS_OLD:
568 fmtname = "vfsold";
569 break;
570 case QFMT_VFS_V0:
571 fmtname = "vfsv0";
572 break;
573 case QFMT_VFS_V1:
574 fmtname = "vfsv1";
575 break;
576 }
577 seq_printf(seq, ",jqfmt=%s", fmtname);
578 }
579
580 if (sbi->s_qf_names[USRQUOTA])
581 seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
582
583 if (sbi->s_qf_names[GRPQUOTA])
584 seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
585
586 if (test_opt(sb, USRQUOTA))
587 seq_puts(seq, ",usrquota");
588
589 if (test_opt(sb, GRPQUOTA))
590 seq_puts(seq, ",grpquota");
591#endif
592}
593
594static char *data_mode_string(unsigned long mode)
595{
596 switch (mode) {
597 case EXT3_MOUNT_JOURNAL_DATA:
598 return "journal";
599 case EXT3_MOUNT_ORDERED_DATA:
600 return "ordered";
601 case EXT3_MOUNT_WRITEBACK_DATA:
602 return "writeback";
603 }
604 return "unknown";
605}
606
607/*
608 * Show an option if
609 * - it's set to a non-default value OR
610 * - if the per-sb default is different from the global default
611 */
612static int ext3_show_options(struct seq_file *seq, struct dentry *root)
613{
614 struct super_block *sb = root->d_sb;
615 struct ext3_sb_info *sbi = EXT3_SB(sb);
616 struct ext3_super_block *es = sbi->s_es;
617 unsigned long def_mount_opts;
618
619 def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
620
621 if (sbi->s_sb_block != 1)
622 seq_printf(seq, ",sb=%lu", sbi->s_sb_block);
623 if (test_opt(sb, MINIX_DF))
624 seq_puts(seq, ",minixdf");
625 if (test_opt(sb, GRPID))
626 seq_puts(seq, ",grpid");
627 if (!test_opt(sb, GRPID) && (def_mount_opts & EXT3_DEFM_BSDGROUPS))
628 seq_puts(seq, ",nogrpid");
629 if (!uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT3_DEF_RESUID)) ||
630 le16_to_cpu(es->s_def_resuid) != EXT3_DEF_RESUID) {
631 seq_printf(seq, ",resuid=%u",
632 from_kuid_munged(&init_user_ns, sbi->s_resuid));
633 }
634 if (!gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT3_DEF_RESGID)) ||
635 le16_to_cpu(es->s_def_resgid) != EXT3_DEF_RESGID) {
636 seq_printf(seq, ",resgid=%u",
637 from_kgid_munged(&init_user_ns, sbi->s_resgid));
638 }
639 if (test_opt(sb, ERRORS_RO)) {
640 int def_errors = le16_to_cpu(es->s_errors);
641
642 if (def_errors == EXT3_ERRORS_PANIC ||
643 def_errors == EXT3_ERRORS_CONTINUE) {
644 seq_puts(seq, ",errors=remount-ro");
645 }
646 }
647 if (test_opt(sb, ERRORS_CONT))
648 seq_puts(seq, ",errors=continue");
649 if (test_opt(sb, ERRORS_PANIC))
650 seq_puts(seq, ",errors=panic");
651 if (test_opt(sb, NO_UID32))
652 seq_puts(seq, ",nouid32");
653 if (test_opt(sb, DEBUG))
654 seq_puts(seq, ",debug");
655#ifdef CONFIG_EXT3_FS_XATTR
656 if (test_opt(sb, XATTR_USER))
657 seq_puts(seq, ",user_xattr");
658 if (!test_opt(sb, XATTR_USER) &&
659 (def_mount_opts & EXT3_DEFM_XATTR_USER)) {
660 seq_puts(seq, ",nouser_xattr");
661 }
662#endif
663#ifdef CONFIG_EXT3_FS_POSIX_ACL
664 if (test_opt(sb, POSIX_ACL))
665 seq_puts(seq, ",acl");
666 if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT3_DEFM_ACL))
667 seq_puts(seq, ",noacl");
668#endif
669 if (!test_opt(sb, RESERVATION))
670 seq_puts(seq, ",noreservation");
671 if (sbi->s_commit_interval) {
672 seq_printf(seq, ",commit=%u",
673 (unsigned) (sbi->s_commit_interval / HZ));
674 }
675
676 /*
677 * Always display barrier state so it's clear what the status is.
678 */
679 seq_puts(seq, ",barrier=");
680 seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0");
681 seq_printf(seq, ",data=%s", data_mode_string(test_opt(sb, DATA_FLAGS)));
682 if (test_opt(sb, DATA_ERR_ABORT))
683 seq_puts(seq, ",data_err=abort");
684
685 if (test_opt(sb, NOLOAD))
686 seq_puts(seq, ",norecovery");
687
688 ext3_show_quota_options(seq, sb);
689
690 return 0;
691}
692
693
694static struct inode *ext3_nfs_get_inode(struct super_block *sb,
695 u64 ino, u32 generation)
696{
697 struct inode *inode;
698
699 if (ino < EXT3_FIRST_INO(sb) && ino != EXT3_ROOT_INO)
700 return ERR_PTR(-ESTALE);
701 if (ino > le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count))
702 return ERR_PTR(-ESTALE);
703
704 /* iget isn't really right if the inode is currently unallocated!!
705 *
706 * ext3_read_inode will return a bad_inode if the inode had been
707 * deleted, so we should be safe.
708 *
709 * Currently we don't know the generation for parent directory, so
710 * a generation of 0 means "accept any"
711 */
712 inode = ext3_iget(sb, ino);
713 if (IS_ERR(inode))
714 return ERR_CAST(inode);
715 if (generation && inode->i_generation != generation) {
716 iput(inode);
717 return ERR_PTR(-ESTALE);
718 }
719
720 return inode;
721}
722
723static struct dentry *ext3_fh_to_dentry(struct super_block *sb, struct fid *fid,
724 int fh_len, int fh_type)
725{
726 return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
727 ext3_nfs_get_inode);
728}
729
730static struct dentry *ext3_fh_to_parent(struct super_block *sb, struct fid *fid,
731 int fh_len, int fh_type)
732{
733 return generic_fh_to_parent(sb, fid, fh_len, fh_type,
734 ext3_nfs_get_inode);
735}
736
737/*
738 * Try to release metadata pages (indirect blocks, directories) which are
739 * mapped via the block device. Since these pages could have journal heads
740 * which would prevent try_to_free_buffers() from freeing them, we must use
741 * jbd layer's try_to_free_buffers() function to release them.
742 */
743static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
744 gfp_t wait)
745{
746 journal_t *journal = EXT3_SB(sb)->s_journal;
747
748 WARN_ON(PageChecked(page));
749 if (!page_has_buffers(page))
750 return 0;
751 if (journal)
752 return journal_try_to_free_buffers(journal, page,
753 wait & ~__GFP_WAIT);
754 return try_to_free_buffers(page);
755}
756
757#ifdef CONFIG_QUOTA
758#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group")
759#define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
760
761static int ext3_write_dquot(struct dquot *dquot);
762static int ext3_acquire_dquot(struct dquot *dquot);
763static int ext3_release_dquot(struct dquot *dquot);
764static int ext3_mark_dquot_dirty(struct dquot *dquot);
765static int ext3_write_info(struct super_block *sb, int type);
766static int ext3_quota_on(struct super_block *sb, int type, int format_id,
767 struct path *path);
768static int ext3_quota_on_mount(struct super_block *sb, int type);
769static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data,
770 size_t len, loff_t off);
771static ssize_t ext3_quota_write(struct super_block *sb, int type,
772 const char *data, size_t len, loff_t off);
773static struct dquot **ext3_get_dquots(struct inode *inode)
774{
775 return EXT3_I(inode)->i_dquot;
776}
777
778static const struct dquot_operations ext3_quota_operations = {
779 .write_dquot = ext3_write_dquot,
780 .acquire_dquot = ext3_acquire_dquot,
781 .release_dquot = ext3_release_dquot,
782 .mark_dirty = ext3_mark_dquot_dirty,
783 .write_info = ext3_write_info,
784 .alloc_dquot = dquot_alloc,
785 .destroy_dquot = dquot_destroy,
786};
787
788static const struct quotactl_ops ext3_qctl_operations = {
789 .quota_on = ext3_quota_on,
790 .quota_off = dquot_quota_off,
791 .quota_sync = dquot_quota_sync,
792 .get_state = dquot_get_state,
793 .set_info = dquot_set_dqinfo,
794 .get_dqblk = dquot_get_dqblk,
795 .set_dqblk = dquot_set_dqblk
796};
797#endif
798
799static const struct super_operations ext3_sops = {
800 .alloc_inode = ext3_alloc_inode,
801 .destroy_inode = ext3_destroy_inode,
802 .write_inode = ext3_write_inode,
803 .dirty_inode = ext3_dirty_inode,
804 .drop_inode = ext3_drop_inode,
805 .evict_inode = ext3_evict_inode,
806 .put_super = ext3_put_super,
807 .sync_fs = ext3_sync_fs,
808 .freeze_fs = ext3_freeze,
809 .unfreeze_fs = ext3_unfreeze,
810 .statfs = ext3_statfs,
811 .remount_fs = ext3_remount,
812 .show_options = ext3_show_options,
813#ifdef CONFIG_QUOTA
814 .quota_read = ext3_quota_read,
815 .quota_write = ext3_quota_write,
816 .get_dquots = ext3_get_dquots,
817#endif
818 .bdev_try_to_free_page = bdev_try_to_free_page,
819};
820
821static const struct export_operations ext3_export_ops = {
822 .fh_to_dentry = ext3_fh_to_dentry,
823 .fh_to_parent = ext3_fh_to_parent,
824 .get_parent = ext3_get_parent,
825};
826
827enum {
828 Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
829 Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
830 Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov,
831 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
832 Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
833 Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
834 Opt_journal_path,
835 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
836 Opt_data_err_abort, Opt_data_err_ignore,
837 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
838 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
839 Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err,
840 Opt_resize, Opt_usrquota, Opt_grpquota
841};
842
843static const match_table_t tokens = {
844 {Opt_bsd_df, "bsddf"},
845 {Opt_minix_df, "minixdf"},
846 {Opt_grpid, "grpid"},
847 {Opt_grpid, "bsdgroups"},
848 {Opt_nogrpid, "nogrpid"},
849 {Opt_nogrpid, "sysvgroups"},
850 {Opt_resgid, "resgid=%u"},
851 {Opt_resuid, "resuid=%u"},
852 {Opt_sb, "sb=%u"},
853 {Opt_err_cont, "errors=continue"},
854 {Opt_err_panic, "errors=panic"},
855 {Opt_err_ro, "errors=remount-ro"},
856 {Opt_nouid32, "nouid32"},
857 {Opt_nocheck, "nocheck"},
858 {Opt_nocheck, "check=none"},
859 {Opt_debug, "debug"},
860 {Opt_oldalloc, "oldalloc"},
861 {Opt_orlov, "orlov"},
862 {Opt_user_xattr, "user_xattr"},
863 {Opt_nouser_xattr, "nouser_xattr"},
864 {Opt_acl, "acl"},
865 {Opt_noacl, "noacl"},
866 {Opt_reservation, "reservation"},
867 {Opt_noreservation, "noreservation"},
868 {Opt_noload, "noload"},
869 {Opt_noload, "norecovery"},
870 {Opt_nobh, "nobh"},
871 {Opt_bh, "bh"},
872 {Opt_commit, "commit=%u"},
873 {Opt_journal_update, "journal=update"},
874 {Opt_journal_inum, "journal=%u"},
875 {Opt_journal_dev, "journal_dev=%u"},
876 {Opt_journal_path, "journal_path=%s"},
877 {Opt_abort, "abort"},
878 {Opt_data_journal, "data=journal"},
879 {Opt_data_ordered, "data=ordered"},
880 {Opt_data_writeback, "data=writeback"},
881 {Opt_data_err_abort, "data_err=abort"},
882 {Opt_data_err_ignore, "data_err=ignore"},
883 {Opt_offusrjquota, "usrjquota="},
884 {Opt_usrjquota, "usrjquota=%s"},
885 {Opt_offgrpjquota, "grpjquota="},
886 {Opt_grpjquota, "grpjquota=%s"},
887 {Opt_jqfmt_vfsold, "jqfmt=vfsold"},
888 {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"},
889 {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"},
890 {Opt_grpquota, "grpquota"},
891 {Opt_noquota, "noquota"},
892 {Opt_quota, "quota"},
893 {Opt_usrquota, "usrquota"},
894 {Opt_barrier, "barrier=%u"},
895 {Opt_barrier, "barrier"},
896 {Opt_nobarrier, "nobarrier"},
897 {Opt_resize, "resize"},
898 {Opt_err, NULL},
899};
900
901static ext3_fsblk_t get_sb_block(void **data, struct super_block *sb)
902{
903 ext3_fsblk_t sb_block;
904 char *options = (char *) *data;
905
906 if (!options || strncmp(options, "sb=", 3) != 0)
907 return 1; /* Default location */
908 options += 3;
909 /*todo: use simple_strtoll with >32bit ext3 */
910 sb_block = simple_strtoul(options, &options, 0);
911 if (*options && *options != ',') {
912 ext3_msg(sb, KERN_ERR, "error: invalid sb specification: %s",
913 (char *) *data);
914 return 1;
915 }
916 if (*options == ',')
917 options++;
918 *data = (void *) options;
919 return sb_block;
920}
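/*
 * Editorial note, not part of the original file: the "sb=" mount option is
 * conventionally given in 1 KiB units, so e.g. "mount -o sb=8193 ..." reads
 * the first backup superblock of a filesystem created with 1 KiB blocks
 * (backups typically live at the start of groups 1, 3, 5, 7, ...).
 */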
921
922#ifdef CONFIG_QUOTA
923static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
924{
925 struct ext3_sb_info *sbi = EXT3_SB(sb);
926 char *qname;
927
928 if (sb_any_quota_loaded(sb) &&
929 !sbi->s_qf_names[qtype]) {
930 ext3_msg(sb, KERN_ERR,
931 "Cannot change journaled "
932 "quota options when quota turned on");
933 return 0;
934 }
935 qname = match_strdup(args);
936 if (!qname) {
937 ext3_msg(sb, KERN_ERR,
938 "Not enough memory for storing quotafile name");
939 return 0;
940 }
941 if (sbi->s_qf_names[qtype]) {
942 int same = !strcmp(sbi->s_qf_names[qtype], qname);
943
944 kfree(qname);
945 if (!same) {
946 ext3_msg(sb, KERN_ERR,
947 "%s quota file already specified",
948 QTYPE2NAME(qtype));
949 }
950 return same;
951 }
952 if (strchr(qname, '/')) {
953 ext3_msg(sb, KERN_ERR,
954 "quotafile must be on filesystem root");
955 kfree(qname);
956 return 0;
957 }
958 sbi->s_qf_names[qtype] = qname;
959 set_opt(sbi->s_mount_opt, QUOTA);
960 return 1;
961}
962
963static int clear_qf_name(struct super_block *sb, int qtype) {
964
965 struct ext3_sb_info *sbi = EXT3_SB(sb);
966
967 if (sb_any_quota_loaded(sb) &&
968 sbi->s_qf_names[qtype]) {
969 ext3_msg(sb, KERN_ERR, "Cannot change journaled quota options"
970 " when quota turned on");
971 return 0;
972 }
973 if (sbi->s_qf_names[qtype]) {
974 kfree(sbi->s_qf_names[qtype]);
975 sbi->s_qf_names[qtype] = NULL;
976 }
977 return 1;
978}
979#endif
980
981static int parse_options (char *options, struct super_block *sb,
982 unsigned int *inum, unsigned long *journal_devnum,
983 ext3_fsblk_t *n_blocks_count, int is_remount)
984{
985 struct ext3_sb_info *sbi = EXT3_SB(sb);
986 char * p;
987 substring_t args[MAX_OPT_ARGS];
988 int data_opt = 0;
989 int option;
990 kuid_t uid;
991 kgid_t gid;
992 char *journal_path;
993 struct inode *journal_inode;
994 struct path path;
995 int error;
996
997#ifdef CONFIG_QUOTA
998 int qfmt;
999#endif
1000
1001 if (!options)
1002 return 1;
1003
1004 while ((p = strsep (&options, ",")) != NULL) {
1005 int token;
1006 if (!*p)
1007 continue;
1008 /*
1009 * Initialize args struct so we know whether arg was
1010 * found; some options take optional arguments.
1011 */
1012 args[0].to = args[0].from = NULL;
1013 token = match_token(p, tokens, args);
1014 switch (token) {
1015 case Opt_bsd_df:
1016 clear_opt (sbi->s_mount_opt, MINIX_DF);
1017 break;
1018 case Opt_minix_df:
1019 set_opt (sbi->s_mount_opt, MINIX_DF);
1020 break;
1021 case Opt_grpid:
1022 set_opt (sbi->s_mount_opt, GRPID);
1023 break;
1024 case Opt_nogrpid:
1025 clear_opt (sbi->s_mount_opt, GRPID);
1026 break;
1027 case Opt_resuid:
1028 if (match_int(&args[0], &option))
1029 return 0;
1030 uid = make_kuid(current_user_ns(), option);
1031 if (!uid_valid(uid)) {
1032 ext3_msg(sb, KERN_ERR, "Invalid uid value %d", option);
1033 return 0;
1034
1035 }
1036 sbi->s_resuid = uid;
1037 break;
1038 case Opt_resgid:
1039 if (match_int(&args[0], &option))
1040 return 0;
1041 gid = make_kgid(current_user_ns(), option);
1042 if (!gid_valid(gid)) {
1043 ext3_msg(sb, KERN_ERR, "Invalid gid value %d", option);
1044 return 0;
1045 }
1046 sbi->s_resgid = gid;
1047 break;
1048 case Opt_sb:
1049 /* handled by get_sb_block() instead of here */
1050 /* *sb_block = match_int(&args[0]); */
1051 break;
1052 case Opt_err_panic:
1053 clear_opt (sbi->s_mount_opt, ERRORS_CONT);
1054 clear_opt (sbi->s_mount_opt, ERRORS_RO);
1055 set_opt (sbi->s_mount_opt, ERRORS_PANIC);
1056 break;
1057 case Opt_err_ro:
1058 clear_opt (sbi->s_mount_opt, ERRORS_CONT);
1059 clear_opt (sbi->s_mount_opt, ERRORS_PANIC);
1060 set_opt (sbi->s_mount_opt, ERRORS_RO);
1061 break;
1062 case Opt_err_cont:
1063 clear_opt (sbi->s_mount_opt, ERRORS_RO);
1064 clear_opt (sbi->s_mount_opt, ERRORS_PANIC);
1065 set_opt (sbi->s_mount_opt, ERRORS_CONT);
1066 break;
1067 case Opt_nouid32:
1068 set_opt (sbi->s_mount_opt, NO_UID32);
1069 break;
1070 case Opt_nocheck:
1071 clear_opt (sbi->s_mount_opt, CHECK);
1072 break;
1073 case Opt_debug:
1074 set_opt (sbi->s_mount_opt, DEBUG);
1075 break;
1076 case Opt_oldalloc:
1077 ext3_msg(sb, KERN_WARNING,
1078 "Ignoring deprecated oldalloc option");
1079 break;
1080 case Opt_orlov:
1081 ext3_msg(sb, KERN_WARNING,
1082 "Ignoring deprecated orlov option");
1083 break;
1084#ifdef CONFIG_EXT3_FS_XATTR
1085 case Opt_user_xattr:
1086 set_opt (sbi->s_mount_opt, XATTR_USER);
1087 break;
1088 case Opt_nouser_xattr:
1089 clear_opt (sbi->s_mount_opt, XATTR_USER);
1090 break;
1091#else
1092 case Opt_user_xattr:
1093 case Opt_nouser_xattr:
1094 ext3_msg(sb, KERN_INFO,
1095 "(no)user_xattr options not supported");
1096 break;
1097#endif
1098#ifdef CONFIG_EXT3_FS_POSIX_ACL
1099 case Opt_acl:
1100 set_opt(sbi->s_mount_opt, POSIX_ACL);
1101 break;
1102 case Opt_noacl:
1103 clear_opt(sbi->s_mount_opt, POSIX_ACL);
1104 break;
1105#else
1106 case Opt_acl:
1107 case Opt_noacl:
1108 ext3_msg(sb, KERN_INFO,
1109 "(no)acl options not supported");
1110 break;
1111#endif
1112 case Opt_reservation:
1113 set_opt(sbi->s_mount_opt, RESERVATION);
1114 break;
1115 case Opt_noreservation:
1116 clear_opt(sbi->s_mount_opt, RESERVATION);
1117 break;
1118 case Opt_journal_update:
1119 /* @@@ FIXME */
1120 /* Eventually we will want to be able to create
1121 a journal file here. For now, only allow the
1122 user to specify an existing inode to be the
1123 journal file. */
1124 if (is_remount) {
1125 ext3_msg(sb, KERN_ERR, "error: cannot specify "
1126 "journal on remount");
1127 return 0;
1128 }
1129 set_opt (sbi->s_mount_opt, UPDATE_JOURNAL);
1130 break;
1131 case Opt_journal_inum:
1132 if (is_remount) {
1133 ext3_msg(sb, KERN_ERR, "error: cannot specify "
1134 "journal on remount");
1135 return 0;
1136 }
1137 if (match_int(&args[0], &option))
1138 return 0;
1139 *inum = option;
1140 break;
1141 case Opt_journal_dev:
1142 if (is_remount) {
1143 ext3_msg(sb, KERN_ERR, "error: cannot specify "
1144 "journal on remount");
1145 return 0;
1146 }
1147 if (match_int(&args[0], &option))
1148 return 0;
1149 *journal_devnum = option;
1150 break;
1151 case Opt_journal_path:
1152 if (is_remount) {
1153 ext3_msg(sb, KERN_ERR, "error: cannot specify "
1154 "journal on remount");
1155 return 0;
1156 }
1157
1158 journal_path = match_strdup(&args[0]);
1159 if (!journal_path) {
1160 ext3_msg(sb, KERN_ERR, "error: could not dup "
1161 "journal device string");
1162 return 0;
1163 }
1164
1165 error = kern_path(journal_path, LOOKUP_FOLLOW, &path);
1166 if (error) {
1167 ext3_msg(sb, KERN_ERR, "error: could not find "
1168 "journal device path: error %d", error);
1169 kfree(journal_path);
1170 return 0;
1171 }
1172
1173 journal_inode = d_inode(path.dentry);
1174 if (!S_ISBLK(journal_inode->i_mode)) {
1175 ext3_msg(sb, KERN_ERR, "error: journal path %s "
1176 "is not a block device", journal_path);
1177 path_put(&path);
1178 kfree(journal_path);
1179 return 0;
1180 }
1181
1182 *journal_devnum = new_encode_dev(journal_inode->i_rdev);
1183 path_put(&path);
1184 kfree(journal_path);
1185 break;
1186 case Opt_noload:
1187 set_opt (sbi->s_mount_opt, NOLOAD);
1188 break;
1189 case Opt_commit:
1190 if (match_int(&args[0], &option))
1191 return 0;
1192 if (option < 0)
1193 return 0;
1194 if (option == 0)
1195 option = JBD_DEFAULT_MAX_COMMIT_AGE;
1196 sbi->s_commit_interval = HZ * option;
1197 break;
1198 case Opt_data_journal:
1199 data_opt = EXT3_MOUNT_JOURNAL_DATA;
1200 goto datacheck;
1201 case Opt_data_ordered:
1202 data_opt = EXT3_MOUNT_ORDERED_DATA;
1203 goto datacheck;
1204 case Opt_data_writeback:
1205 data_opt = EXT3_MOUNT_WRITEBACK_DATA;
1206 datacheck:
1207 if (is_remount) {
1208 if (test_opt(sb, DATA_FLAGS) == data_opt)
1209 break;
1210 ext3_msg(sb, KERN_ERR,
1211 "error: cannot change "
1212 "data mode on remount. The filesystem "
1213 "is mounted in data=%s mode and you "
1214 "try to remount it in data=%s mode.",
1215 data_mode_string(test_opt(sb,
1216 DATA_FLAGS)),
1217 data_mode_string(data_opt));
1218 return 0;
1219 } else {
1220 clear_opt(sbi->s_mount_opt, DATA_FLAGS);
1221 sbi->s_mount_opt |= data_opt;
1222 }
1223 break;
1224 case Opt_data_err_abort:
1225 set_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
1226 break;
1227 case Opt_data_err_ignore:
1228 clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
1229 break;
1230#ifdef CONFIG_QUOTA
1231 case Opt_usrjquota:
1232 if (!set_qf_name(sb, USRQUOTA, &args[0]))
1233 return 0;
1234 break;
1235 case Opt_grpjquota:
1236 if (!set_qf_name(sb, GRPQUOTA, &args[0]))
1237 return 0;
1238 break;
1239 case Opt_offusrjquota:
1240 if (!clear_qf_name(sb, USRQUOTA))
1241 return 0;
1242 break;
1243 case Opt_offgrpjquota:
1244 if (!clear_qf_name(sb, GRPQUOTA))
1245 return 0;
1246 break;
1247 case Opt_jqfmt_vfsold:
1248 qfmt = QFMT_VFS_OLD;
1249 goto set_qf_format;
1250 case Opt_jqfmt_vfsv0:
1251 qfmt = QFMT_VFS_V0;
1252 goto set_qf_format;
1253 case Opt_jqfmt_vfsv1:
1254 qfmt = QFMT_VFS_V1;
1255set_qf_format:
1256 if (sb_any_quota_loaded(sb) &&
1257 sbi->s_jquota_fmt != qfmt) {
1258 ext3_msg(sb, KERN_ERR, "error: cannot change "
1259 "journaled quota options when "
1260 "quota turned on.");
1261 return 0;
1262 }
1263 sbi->s_jquota_fmt = qfmt;
1264 break;
1265 case Opt_quota:
1266 case Opt_usrquota:
1267 set_opt(sbi->s_mount_opt, QUOTA);
1268 set_opt(sbi->s_mount_opt, USRQUOTA);
1269 break;
1270 case Opt_grpquota:
1271 set_opt(sbi->s_mount_opt, QUOTA);
1272 set_opt(sbi->s_mount_opt, GRPQUOTA);
1273 break;
1274 case Opt_noquota:
1275 if (sb_any_quota_loaded(sb)) {
1276 ext3_msg(sb, KERN_ERR, "error: cannot change "
1277 "quota options when quota turned on.");
1278 return 0;
1279 }
1280 clear_opt(sbi->s_mount_opt, QUOTA);
1281 clear_opt(sbi->s_mount_opt, USRQUOTA);
1282 clear_opt(sbi->s_mount_opt, GRPQUOTA);
1283 break;
1284#else
1285 case Opt_quota:
1286 case Opt_usrquota:
1287 case Opt_grpquota:
1288 ext3_msg(sb, KERN_ERR,
1289 "error: quota options not supported.");
1290 break;
1291 case Opt_usrjquota:
1292 case Opt_grpjquota:
1293 case Opt_offusrjquota:
1294 case Opt_offgrpjquota:
1295 case Opt_jqfmt_vfsold:
1296 case Opt_jqfmt_vfsv0:
1297 case Opt_jqfmt_vfsv1:
1298 ext3_msg(sb, KERN_ERR,
1299 "error: journaled quota options not "
1300 "supported.");
1301 break;
1302 case Opt_noquota:
1303 break;
1304#endif
1305 case Opt_abort:
1306 set_opt(sbi->s_mount_opt, ABORT);
1307 break;
1308 case Opt_nobarrier:
1309 clear_opt(sbi->s_mount_opt, BARRIER);
1310 break;
1311 case Opt_barrier:
1312 if (args[0].from) {
1313 if (match_int(&args[0], &option))
1314 return 0;
1315 } else
1316 option = 1; /* No argument, default to 1 */
1317 if (option)
1318 set_opt(sbi->s_mount_opt, BARRIER);
1319 else
1320 clear_opt(sbi->s_mount_opt, BARRIER);
1321 break;
1322 case Opt_ignore:
1323 break;
1324 case Opt_resize:
1325 if (!is_remount) {
1326 ext3_msg(sb, KERN_ERR,
1327 "error: resize option only available "
1328 "for remount");
1329 return 0;
1330 }
1331 if (match_int(&args[0], &option) != 0)
1332 return 0;
1333 *n_blocks_count = option;
1334 break;
1335 case Opt_nobh:
1336 ext3_msg(sb, KERN_WARNING,
1337 "warning: ignoring deprecated nobh option");
1338 break;
1339 case Opt_bh:
1340 ext3_msg(sb, KERN_WARNING,
1341 "warning: ignoring deprecated bh option");
1342 break;
1343 default:
1344 ext3_msg(sb, KERN_ERR,
1345 "error: unrecognized mount option \"%s\" "
1346 "or missing value", p);
1347 return 0;
1348 }
1349 }
1350#ifdef CONFIG_QUOTA
1351 if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
1352 if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA])
1353 clear_opt(sbi->s_mount_opt, USRQUOTA);
1354 if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA])
1355 clear_opt(sbi->s_mount_opt, GRPQUOTA);
1356
1357 if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
1358 ext3_msg(sb, KERN_ERR, "error: old and new quota "
1359 "format mixing.");
1360 return 0;
1361 }
1362
1363 if (!sbi->s_jquota_fmt) {
1364 ext3_msg(sb, KERN_ERR, "error: journaled quota format "
1365 "not specified.");
1366 return 0;
1367 }
1368 }
1369#endif
1370 return 1;
1371}
1372
1373static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es,
1374 int read_only)
1375{
1376 struct ext3_sb_info *sbi = EXT3_SB(sb);
1377 int res = 0;
1378
1379 if (le32_to_cpu(es->s_rev_level) > EXT3_MAX_SUPP_REV) {
1380 ext3_msg(sb, KERN_ERR,
1381 "error: revision level too high, "
1382 "forcing read-only mode");
1383 res = MS_RDONLY;
1384 }
1385 if (read_only)
1386 return res;
1387 if (!(sbi->s_mount_state & EXT3_VALID_FS))
1388 ext3_msg(sb, KERN_WARNING,
1389 "warning: mounting unchecked fs, "
1390 "running e2fsck is recommended");
1391 else if ((sbi->s_mount_state & EXT3_ERROR_FS))
1392 ext3_msg(sb, KERN_WARNING,
1393 "warning: mounting fs with errors, "
1394 "running e2fsck is recommended");
1395 else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 &&
1396 le16_to_cpu(es->s_mnt_count) >=
1397 le16_to_cpu(es->s_max_mnt_count))
1398 ext3_msg(sb, KERN_WARNING,
1399 "warning: maximal mount count reached, "
1400 "running e2fsck is recommended");
1401 else if (le32_to_cpu(es->s_checkinterval) &&
1402 (le32_to_cpu(es->s_lastcheck) +
1403 le32_to_cpu(es->s_checkinterval) <= get_seconds()))
1404 ext3_msg(sb, KERN_WARNING,
1405 "warning: checktime reached, "
1406 "running e2fsck is recommended");
1407#if 0
1408 /* @@@ We _will_ want to clear the valid bit if we find
1409 inconsistencies, to force a fsck at reboot. But for
1410 a plain journaled filesystem we can keep it set as
1411 valid forever! :) */
1412 es->s_state &= cpu_to_le16(~EXT3_VALID_FS);
1413#endif
1414 if (!le16_to_cpu(es->s_max_mnt_count))
1415 es->s_max_mnt_count = cpu_to_le16(EXT3_DFL_MAX_MNT_COUNT);
1416 le16_add_cpu(&es->s_mnt_count, 1);
1417 es->s_mtime = cpu_to_le32(get_seconds());
1418 ext3_update_dynamic_rev(sb);
1419 EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
1420
1421 ext3_commit_super(sb, es, 1);
1422 if (test_opt(sb, DEBUG))
1423 ext3_msg(sb, KERN_INFO, "[bs=%lu, gc=%lu, "
1424 "bpg=%lu, ipg=%lu, mo=%04lx]",
1425 sb->s_blocksize,
1426 sbi->s_groups_count,
1427 EXT3_BLOCKS_PER_GROUP(sb),
1428 EXT3_INODES_PER_GROUP(sb),
1429 sbi->s_mount_opt);
1430
1431 if (EXT3_SB(sb)->s_journal->j_inode == NULL) {
1432 char b[BDEVNAME_SIZE];
1433 ext3_msg(sb, KERN_INFO, "using external journal on %s",
1434 bdevname(EXT3_SB(sb)->s_journal->j_dev, b));
1435 } else {
1436 ext3_msg(sb, KERN_INFO, "using internal journal");
1437 }
1438 cleancache_init_fs(sb);
1439 return res;
1440}
1441
1442/* Called at mount-time, super-block is locked */
1443static int ext3_check_descriptors(struct super_block *sb)
1444{
1445 struct ext3_sb_info *sbi = EXT3_SB(sb);
1446 int i;
1447
1448 ext3_debug ("Checking group descriptors");
1449
1450 for (i = 0; i < sbi->s_groups_count; i++) {
1451 struct ext3_group_desc *gdp = ext3_get_group_desc(sb, i, NULL);
1452 ext3_fsblk_t first_block = ext3_group_first_block_no(sb, i);
1453 ext3_fsblk_t last_block;
1454
1455 if (i == sbi->s_groups_count - 1)
1456 last_block = le32_to_cpu(sbi->s_es->s_blocks_count) - 1;
1457 else
1458 last_block = first_block +
1459 (EXT3_BLOCKS_PER_GROUP(sb) - 1);
1460
1461 if (le32_to_cpu(gdp->bg_block_bitmap) < first_block ||
1462 le32_to_cpu(gdp->bg_block_bitmap) > last_block)
1463 {
1464 ext3_error (sb, "ext3_check_descriptors",
1465 "Block bitmap for group %d"
1466 " not in group (block %lu)!",
1467 i, (unsigned long)
1468 le32_to_cpu(gdp->bg_block_bitmap));
1469 return 0;
1470 }
1471 if (le32_to_cpu(gdp->bg_inode_bitmap) < first_block ||
1472 le32_to_cpu(gdp->bg_inode_bitmap) > last_block)
1473 {
1474 ext3_error (sb, "ext3_check_descriptors",
1475 "Inode bitmap for group %d"
1476 " not in group (block %lu)!",
1477 i, (unsigned long)
1478 le32_to_cpu(gdp->bg_inode_bitmap));
1479 return 0;
1480 }
1481 if (le32_to_cpu(gdp->bg_inode_table) < first_block ||
1482 le32_to_cpu(gdp->bg_inode_table) + sbi->s_itb_per_group - 1 >
1483 last_block)
1484 {
1485 ext3_error (sb, "ext3_check_descriptors",
1486 "Inode table for group %d"
1487 " not in group (block %lu)!",
1488 i, (unsigned long)
1489 le32_to_cpu(gdp->bg_inode_table));
1490 return 0;
1491 }
1492 }
1493
1494 sbi->s_es->s_free_blocks_count = cpu_to_le32(ext3_count_free_blocks(sb));
1495 sbi->s_es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb));
1496 return 1;
1497}
1498
1499
1500/* ext3_orphan_cleanup() walks a singly-linked list of inodes (starting at
1501 * the superblock) which were deleted from all directories, but held open by
1502 * a process at the time of a crash. We walk the list and try to delete these
1503 * inodes at recovery time (only with a read-write filesystem).
1504 *
1505 * In order to keep the orphan inode chain consistent during traversal (in
1506 * case of crash during recovery), we link each inode into the superblock
1507 * orphan list_head and handle it the same way as an inode deletion during
1508 * normal operation (which journals the operations for us).
1509 *
1510 * We only do an iget() and an iput() on each inode, which is very safe if we
1511 * accidentally point at an in-use or already deleted inode. The worst that
1512 * can happen in this case is that we get a "bit already cleared" message from
1513 * ext3_free_inode(). The only reason we would point at a wrong inode is if
1514 * e2fsck was run on this filesystem, and it must have already done the orphan
1515 * inode cleanup for us, so we can safely abort without any further action.
1516 */
1517static void ext3_orphan_cleanup (struct super_block * sb,
1518 struct ext3_super_block * es)
1519{
1520 unsigned int s_flags = sb->s_flags;
1521 int nr_orphans = 0, nr_truncates = 0;
1522#ifdef CONFIG_QUOTA
1523 int i;
1524#endif
1525 if (!es->s_last_orphan) {
1526 jbd_debug(4, "no orphan inodes to clean up\n");
1527 return;
1528 }
1529
1530 if (bdev_read_only(sb->s_bdev)) {
1531 ext3_msg(sb, KERN_ERR, "error: write access "
1532 "unavailable, skipping orphan cleanup.");
1533 return;
1534 }
1535
1536 /* Check if feature set allows readwrite operations */
1537 if (EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP)) {
1538 ext3_msg(sb, KERN_INFO, "Skipping orphan cleanup due to "
1539 "unknown ROCOMPAT features");
1540 return;
1541 }
1542
1543 if (EXT3_SB(sb)->s_mount_state & EXT3_ERROR_FS) {
1544 /* don't clear list on RO mount w/ errors */
1545 if (es->s_last_orphan && !(s_flags & MS_RDONLY)) {
1546 jbd_debug(1, "Errors on filesystem, "
1547 "clearing orphan list.\n");
1548 es->s_last_orphan = 0;
1549 }
1550 jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
1551 return;
1552 }
1553
1554 if (s_flags & MS_RDONLY) {
1555 ext3_msg(sb, KERN_INFO, "orphan cleanup on readonly fs");
1556 sb->s_flags &= ~MS_RDONLY;
1557 }
1558#ifdef CONFIG_QUOTA
1559 /* Needed for iput() to work correctly and not trash data */
1560 sb->s_flags |= MS_ACTIVE;
1561 /* Turn on quotas so that they are updated correctly */
1562 for (i = 0; i < EXT3_MAXQUOTAS; i++) {
1563 if (EXT3_SB(sb)->s_qf_names[i]) {
1564 int ret = ext3_quota_on_mount(sb, i);
1565 if (ret < 0)
1566 ext3_msg(sb, KERN_ERR,
1567 "error: cannot turn on journaled "
1568 "quota: %d", ret);
1569 }
1570 }
1571#endif
1572
1573 while (es->s_last_orphan) {
1574 struct inode *inode;
1575
1576 inode = ext3_orphan_get(sb, le32_to_cpu(es->s_last_orphan));
1577 if (IS_ERR(inode)) {
1578 es->s_last_orphan = 0;
1579 break;
1580 }
1581
1582 list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan);
1583 dquot_initialize(inode);
1584 if (inode->i_nlink) {
1585 printk(KERN_DEBUG
1586 "%s: truncating inode %lu to %Ld bytes\n",
1587 __func__, inode->i_ino, inode->i_size);
1588 jbd_debug(2, "truncating inode %lu to %Ld bytes\n",
1589 inode->i_ino, inode->i_size);
1590 ext3_truncate(inode);
1591 nr_truncates++;
1592 } else {
1593 printk(KERN_DEBUG
1594 "%s: deleting unreferenced inode %lu\n",
1595 __func__, inode->i_ino);
1596 jbd_debug(2, "deleting unreferenced inode %lu\n",
1597 inode->i_ino);
1598 nr_orphans++;
1599 }
1600 iput(inode); /* The delete magic happens here! */
1601 }
1602
1603#define PLURAL(x) (x), ((x)==1) ? "" : "s"
1604
1605 if (nr_orphans)
1606 ext3_msg(sb, KERN_INFO, "%d orphan inode%s deleted",
1607 PLURAL(nr_orphans));
1608 if (nr_truncates)
1609 ext3_msg(sb, KERN_INFO, "%d truncate%s cleaned up",
1610 PLURAL(nr_truncates));
1611#ifdef CONFIG_QUOTA
1612 /* Turn quotas off */
1613 for (i = 0; i < EXT3_MAXQUOTAS; i++) {
1614 if (sb_dqopt(sb)->files[i])
1615 dquot_quota_off(sb, i);
1616 }
1617#endif
1618 sb->s_flags = s_flags; /* Restore MS_RDONLY status */
1619}
1620
1621/*
1622 * Maximal file size. There is a limit from the direct, indirect, double- and
1623 * triple-indirect block mappings, and also a limit of (2^32 - 1) 512-byte sectors
1624 * in i_blocks. We need to be 1 filesystem block less than the 2^32 sector limit.
1625 */
1626static loff_t ext3_max_size(int bits)
1627{
1628 loff_t res = EXT3_NDIR_BLOCKS;
1629 int meta_blocks;
1630 loff_t upper_limit;
1631
1632 /* This is calculated to be the largest file size for a
1633	 * dense file such that the total number of
1634	 * sectors in the file, including data and all indirect blocks,
1635	 * does not exceed 2^32 - 1, since the
1636	 * __u32 i_blocks field holds the total number of
1637	 * 512-byte blocks of the file.
1638 */
1639 upper_limit = (1LL << 32) - 1;
1640
1641 /* total blocks in file system block size */
1642 upper_limit >>= (bits - 9);
1643
1644
1645 /* indirect blocks */
1646 meta_blocks = 1;
1647 /* double indirect blocks */
1648 meta_blocks += 1 + (1LL << (bits-2));
1649	/* triple indirect blocks */
1650 meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2)));
1651
1652 upper_limit -= meta_blocks;
1653 upper_limit <<= bits;
1654
1655 res += 1LL << (bits-2);
1656 res += 1LL << (2*(bits-2));
1657 res += 1LL << (3*(bits-2));
1658 res <<= bits;
1659 if (res > upper_limit)
1660 res = upper_limit;
1661
1662 if (res > MAX_LFS_FILESIZE)
1663 res = MAX_LFS_FILESIZE;
1664
1665 return res;
1666}
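
As an illustration of the arithmetic above, here is a minimal standalone userspace sketch (not kernel code; EXT3_NDIR_BLOCKS is assumed to be 12 and the architecture-dependent MAX_LFS_FILESIZE clamp is left out) that prints the resulting file-size limits for the common block sizes:

#include <stdio.h>

static long long max_size(int bits)
{
	long long res = 12;			/* EXT3_NDIR_BLOCKS direct blocks */
	long long meta, upper = (1LL << 32) - 1;/* 2^32 - 1 sectors fit in i_blocks */

	upper >>= (bits - 9);			/* 512-byte sectors -> fs blocks */
	meta  = 1;				/* single indirect block */
	meta += 1 + (1LL << (bits - 2));	/* double indirect blocks */
	meta += 1 + (1LL << (bits - 2)) + (1LL << (2 * (bits - 2)));
	upper -= meta;				/* data blocks only */
	upper <<= bits;				/* blocks -> bytes */

	res += 1LL << (bits - 2);
	res += 1LL << (2 * (bits - 2));
	res += 1LL << (3 * (bits - 2));
	res <<= bits;
	return res > upper ? upper : res;
}

int main(void)
{
	int bits;

	for (bits = 10; bits <= 12; bits++)
		printf("blocksize %d -> max file size %lld bytes\n",
		       1 << bits, max_size(bits));
	return 0;
}

With 1 KiB blocks this works out to roughly 16 GiB, and with 4 KiB blocks to roughly 2 TiB.
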
1667
1668static ext3_fsblk_t descriptor_loc(struct super_block *sb,
1669 ext3_fsblk_t logic_sb_block,
1670 int nr)
1671{
1672 struct ext3_sb_info *sbi = EXT3_SB(sb);
1673 unsigned long bg, first_meta_bg;
1674 int has_super = 0;
1675
1676 first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg);
1677
1678 if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_META_BG) ||
1679 nr < first_meta_bg)
1680 return (logic_sb_block + nr + 1);
1681 bg = sbi->s_desc_per_block * nr;
1682 if (ext3_bg_has_super(sb, bg))
1683 has_super = 1;
1684 return (has_super + ext3_group_first_block_no(sb, bg));
1685}
1686
1687
1688static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1689{
1690 struct buffer_head * bh;
1691 struct ext3_super_block *es = NULL;
1692 struct ext3_sb_info *sbi;
1693 ext3_fsblk_t block;
1694 ext3_fsblk_t sb_block = get_sb_block(&data, sb);
1695 ext3_fsblk_t logic_sb_block;
1696 unsigned long offset = 0;
1697 unsigned int journal_inum = 0;
1698 unsigned long journal_devnum = 0;
1699 unsigned long def_mount_opts;
1700 struct inode *root;
1701 int blocksize;
1702 int hblock;
1703 int db_count;
1704 int i;
1705 int needs_recovery;
1706 int ret = -EINVAL;
1707 __le32 features;
1708 int err;
1709
1710 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
1711 if (!sbi)
1712 return -ENOMEM;
1713
1714 sbi->s_blockgroup_lock =
1715 kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
1716 if (!sbi->s_blockgroup_lock) {
1717 kfree(sbi);
1718 return -ENOMEM;
1719 }
1720 sb->s_fs_info = sbi;
1721 sbi->s_sb_block = sb_block;
1722
1723 blocksize = sb_min_blocksize(sb, EXT3_MIN_BLOCK_SIZE);
1724 if (!blocksize) {
1725 ext3_msg(sb, KERN_ERR, "error: unable to set blocksize");
1726 goto out_fail;
1727 }
1728
1729 /*
1730 * The ext3 superblock will not be buffer aligned for other than 1kB
1731 * block sizes. We need to calculate the offset from buffer start.
1732 */
1733 if (blocksize != EXT3_MIN_BLOCK_SIZE) {
1734 logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize;
1735 offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize;
1736 } else {
1737 logic_sb_block = sb_block;
1738 }
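	/*
	 * For example, with a 4096-byte block size and the default sb_block
	 * of 1, the superblock is found in logical block 0 at byte offset
	 * 1024 within that block.
	 */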
1739
1740 if (!(bh = sb_bread(sb, logic_sb_block))) {
1741 ext3_msg(sb, KERN_ERR, "error: unable to read superblock");
1742 goto out_fail;
1743 }
1744 /*
1745 * Note: s_es must be initialized as soon as possible because
1746 * some ext3 macro-instructions depend on its value
1747 */
1748 es = (struct ext3_super_block *) (bh->b_data + offset);
1749 sbi->s_es = es;
1750 sb->s_magic = le16_to_cpu(es->s_magic);
1751 if (sb->s_magic != EXT3_SUPER_MAGIC)
1752 goto cantfind_ext3;
1753
1754 /* Set defaults before we parse the mount options */
1755 def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
1756 if (def_mount_opts & EXT3_DEFM_DEBUG)
1757 set_opt(sbi->s_mount_opt, DEBUG);
1758 if (def_mount_opts & EXT3_DEFM_BSDGROUPS)
1759 set_opt(sbi->s_mount_opt, GRPID);
1760 if (def_mount_opts & EXT3_DEFM_UID16)
1761 set_opt(sbi->s_mount_opt, NO_UID32);
1762#ifdef CONFIG_EXT3_FS_XATTR
1763 if (def_mount_opts & EXT3_DEFM_XATTR_USER)
1764 set_opt(sbi->s_mount_opt, XATTR_USER);
1765#endif
1766#ifdef CONFIG_EXT3_FS_POSIX_ACL
1767 if (def_mount_opts & EXT3_DEFM_ACL)
1768 set_opt(sbi->s_mount_opt, POSIX_ACL);
1769#endif
1770 if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_DATA)
1771 set_opt(sbi->s_mount_opt, JOURNAL_DATA);
1772 else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_ORDERED)
1773 set_opt(sbi->s_mount_opt, ORDERED_DATA);
1774 else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_WBACK)
1775 set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
1776
1777 if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_PANIC)
1778 set_opt(sbi->s_mount_opt, ERRORS_PANIC);
1779 else if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_CONTINUE)
1780 set_opt(sbi->s_mount_opt, ERRORS_CONT);
1781 else
1782 set_opt(sbi->s_mount_opt, ERRORS_RO);
1783
1784 sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid));
1785 sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid));
1786
1787 /* enable barriers by default */
1788 set_opt(sbi->s_mount_opt, BARRIER);
1789 set_opt(sbi->s_mount_opt, RESERVATION);
1790
1791 if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum,
1792 NULL, 0))
1793 goto failed_mount;
1794
1795 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
1796 (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
1797
1798 if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV &&
1799 (EXT3_HAS_COMPAT_FEATURE(sb, ~0U) ||
1800 EXT3_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
1801 EXT3_HAS_INCOMPAT_FEATURE(sb, ~0U)))
1802 ext3_msg(sb, KERN_WARNING,
1803 "warning: feature flags set on rev 0 fs, "
1804 "running e2fsck is recommended");
1805 /*
1806 * Check feature flags regardless of the revision level, since we
1807 * previously didn't change the revision level when setting the flags,
1808 * so there is a chance incompat flags are set on a rev 0 filesystem.
1809 */
1810 features = EXT3_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP);
1811 if (features) {
1812 ext3_msg(sb, KERN_ERR,
1813 "error: couldn't mount because of unsupported "
1814 "optional features (%x)", le32_to_cpu(features));
1815 goto failed_mount;
1816 }
1817 features = EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP);
1818 if (!(sb->s_flags & MS_RDONLY) && features) {
1819 ext3_msg(sb, KERN_ERR,
1820 "error: couldn't mount RDWR because of unsupported "
1821 "optional features (%x)", le32_to_cpu(features));
1822 goto failed_mount;
1823 }
1824 blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
1825
1826 if (blocksize < EXT3_MIN_BLOCK_SIZE ||
1827 blocksize > EXT3_MAX_BLOCK_SIZE) {
1828 ext3_msg(sb, KERN_ERR,
1829 "error: couldn't mount because of unsupported "
1830 "filesystem blocksize %d", blocksize);
1831 goto failed_mount;
1832 }
1833
1834 hblock = bdev_logical_block_size(sb->s_bdev);
1835 if (sb->s_blocksize != blocksize) {
1836 /*
1837		 * Make sure the blocksize for the filesystem is at least
1838		 * as large as the hardware sector size of the device.
1839 */
1840 if (blocksize < hblock) {
1841 ext3_msg(sb, KERN_ERR,
1842 "error: fsblocksize %d too small for "
1843 "hardware sectorsize %d", blocksize, hblock);
1844 goto failed_mount;
1845 }
1846
1847 brelse (bh);
1848 if (!sb_set_blocksize(sb, blocksize)) {
1849 ext3_msg(sb, KERN_ERR,
1850 "error: bad blocksize %d", blocksize);
1851 goto out_fail;
1852 }
1853 logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize;
1854 offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize;
1855 bh = sb_bread(sb, logic_sb_block);
1856 if (!bh) {
1857 ext3_msg(sb, KERN_ERR,
1858 "error: can't read superblock on 2nd try");
1859 goto failed_mount;
1860 }
1861 es = (struct ext3_super_block *)(bh->b_data + offset);
1862 sbi->s_es = es;
1863 if (es->s_magic != cpu_to_le16(EXT3_SUPER_MAGIC)) {
1864 ext3_msg(sb, KERN_ERR,
1865 "error: magic mismatch");
1866 goto failed_mount;
1867 }
1868 }
1869
1870 sb->s_maxbytes = ext3_max_size(sb->s_blocksize_bits);
1871
1872 if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV) {
1873 sbi->s_inode_size = EXT3_GOOD_OLD_INODE_SIZE;
1874 sbi->s_first_ino = EXT3_GOOD_OLD_FIRST_INO;
1875 } else {
1876 sbi->s_inode_size = le16_to_cpu(es->s_inode_size);
1877 sbi->s_first_ino = le32_to_cpu(es->s_first_ino);
1878 if ((sbi->s_inode_size < EXT3_GOOD_OLD_INODE_SIZE) ||
1879 (!is_power_of_2(sbi->s_inode_size)) ||
1880 (sbi->s_inode_size > blocksize)) {
1881 ext3_msg(sb, KERN_ERR,
1882 "error: unsupported inode size: %d",
1883 sbi->s_inode_size);
1884 goto failed_mount;
1885 }
1886 }
1887 sbi->s_frag_size = EXT3_MIN_FRAG_SIZE <<
1888 le32_to_cpu(es->s_log_frag_size);
1889 if (blocksize != sbi->s_frag_size) {
1890 ext3_msg(sb, KERN_ERR,
1891 "error: fragsize %lu != blocksize %u (unsupported)",
1892 sbi->s_frag_size, blocksize);
1893 goto failed_mount;
1894 }
1895 sbi->s_frags_per_block = 1;
1896 sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
1897 sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group);
1898 sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
1899 if (EXT3_INODE_SIZE(sb) == 0 || EXT3_INODES_PER_GROUP(sb) == 0)
1900 goto cantfind_ext3;
1901 sbi->s_inodes_per_block = blocksize / EXT3_INODE_SIZE(sb);
1902 if (sbi->s_inodes_per_block == 0)
1903 goto cantfind_ext3;
1904 sbi->s_itb_per_group = sbi->s_inodes_per_group /
1905 sbi->s_inodes_per_block;
1906 sbi->s_desc_per_block = blocksize / sizeof(struct ext3_group_desc);
1907 sbi->s_sbh = bh;
1908 sbi->s_mount_state = le16_to_cpu(es->s_state);
1909 sbi->s_addr_per_block_bits = ilog2(EXT3_ADDR_PER_BLOCK(sb));
1910 sbi->s_desc_per_block_bits = ilog2(EXT3_DESC_PER_BLOCK(sb));
1911 for (i = 0; i < 4; i++)
1912 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
1913 sbi->s_def_hash_version = es->s_def_hash_version;
1914 i = le32_to_cpu(es->s_flags);
1915 if (i & EXT2_FLAGS_UNSIGNED_HASH)
1916 sbi->s_hash_unsigned = 3;
1917 else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
1918#ifdef __CHAR_UNSIGNED__
1919 es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
1920 sbi->s_hash_unsigned = 3;
1921#else
1922 es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
1923#endif
1924 }
1925
1926 if (sbi->s_blocks_per_group > blocksize * 8) {
1927 ext3_msg(sb, KERN_ERR,
1928 "#blocks per group too big: %lu",
1929 sbi->s_blocks_per_group);
1930 goto failed_mount;
1931 }
1932 if (sbi->s_frags_per_group > blocksize * 8) {
1933 ext3_msg(sb, KERN_ERR,
1934 "error: #fragments per group too big: %lu",
1935 sbi->s_frags_per_group);
1936 goto failed_mount;
1937 }
1938 if (sbi->s_inodes_per_group > blocksize * 8) {
1939 ext3_msg(sb, KERN_ERR,
1940 "error: #inodes per group too big: %lu",
1941 sbi->s_inodes_per_group);
1942 goto failed_mount;
1943 }
1944
1945 err = generic_check_addressable(sb->s_blocksize_bits,
1946 le32_to_cpu(es->s_blocks_count));
1947 if (err) {
1948 ext3_msg(sb, KERN_ERR,
1949 "error: filesystem is too large to mount safely");
1950 if (sizeof(sector_t) < 8)
1951 ext3_msg(sb, KERN_ERR,
1952 "error: CONFIG_LBDAF not enabled");
1953 ret = err;
1954 goto failed_mount;
1955 }
1956
1957 if (EXT3_BLOCKS_PER_GROUP(sb) == 0)
1958 goto cantfind_ext3;
1959 sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) -
1960 le32_to_cpu(es->s_first_data_block) - 1)
1961 / EXT3_BLOCKS_PER_GROUP(sb)) + 1;
1962 db_count = DIV_ROUND_UP(sbi->s_groups_count, EXT3_DESC_PER_BLOCK(sb));
1963 sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *),
1964 GFP_KERNEL);
1965 if (sbi->s_group_desc == NULL) {
1966 ext3_msg(sb, KERN_ERR,
1967 "error: not enough memory");
1968 ret = -ENOMEM;
1969 goto failed_mount;
1970 }
1971
1972 bgl_lock_init(sbi->s_blockgroup_lock);
1973
1974 for (i = 0; i < db_count; i++) {
1975 block = descriptor_loc(sb, logic_sb_block, i);
1976 sbi->s_group_desc[i] = sb_bread(sb, block);
1977 if (!sbi->s_group_desc[i]) {
1978 ext3_msg(sb, KERN_ERR,
1979 "error: can't read group descriptor %d", i);
1980 db_count = i;
1981 goto failed_mount2;
1982 }
1983 }
1984 if (!ext3_check_descriptors (sb)) {
1985 ext3_msg(sb, KERN_ERR,
1986 "error: group descriptors corrupted");
1987 goto failed_mount2;
1988 }
1989 sbi->s_gdb_count = db_count;
1990 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
1991 spin_lock_init(&sbi->s_next_gen_lock);
1992
1993	/* per filesystem reservation list head & lock */
1994 spin_lock_init(&sbi->s_rsv_window_lock);
1995 sbi->s_rsv_window_root = RB_ROOT;
1996 /* Add a single, static dummy reservation to the start of the
1997 * reservation window list --- it gives us a placeholder for
1998 * append-at-start-of-list which makes the allocation logic
1999 * _much_ simpler. */
2000 sbi->s_rsv_window_head.rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
2001 sbi->s_rsv_window_head.rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
2002 sbi->s_rsv_window_head.rsv_alloc_hit = 0;
2003 sbi->s_rsv_window_head.rsv_goal_size = 0;
2004 ext3_rsv_window_add(sb, &sbi->s_rsv_window_head);
2005
2006 /*
2007 * set up enough so that it can read an inode
2008 */
2009 sb->s_op = &ext3_sops;
2010 sb->s_export_op = &ext3_export_ops;
2011 sb->s_xattr = ext3_xattr_handlers;
2012#ifdef CONFIG_QUOTA
2013 sb->s_qcop = &ext3_qctl_operations;
2014 sb->dq_op = &ext3_quota_operations;
2015 sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
2016#endif
2017 memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
2018 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
2019 mutex_init(&sbi->s_orphan_lock);
2020 mutex_init(&sbi->s_resize_lock);
2021
2022 sb->s_root = NULL;
2023
2024 needs_recovery = (es->s_last_orphan != 0 ||
2025 EXT3_HAS_INCOMPAT_FEATURE(sb,
2026 EXT3_FEATURE_INCOMPAT_RECOVER));
2027
2028 /*
2029 * The first inode we look at is the journal inode. Don't try
2030 * root first: it may be modified in the journal!
2031 */
2032 if (!test_opt(sb, NOLOAD) &&
2033 EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) {
2034 if (ext3_load_journal(sb, es, journal_devnum))
2035 goto failed_mount2;
2036 } else if (journal_inum) {
2037 if (ext3_create_journal(sb, es, journal_inum))
2038 goto failed_mount2;
2039 } else {
2040 if (!silent)
2041 ext3_msg(sb, KERN_ERR,
2042 "error: no journal found. "
2043 "mounting ext3 over ext2?");
2044 goto failed_mount2;
2045 }
2046 err = percpu_counter_init(&sbi->s_freeblocks_counter,
2047 ext3_count_free_blocks(sb), GFP_KERNEL);
2048 if (!err) {
2049 err = percpu_counter_init(&sbi->s_freeinodes_counter,
2050 ext3_count_free_inodes(sb), GFP_KERNEL);
2051 }
2052 if (!err) {
2053 err = percpu_counter_init(&sbi->s_dirs_counter,
2054 ext3_count_dirs(sb), GFP_KERNEL);
2055 }
2056 if (err) {
2057 ext3_msg(sb, KERN_ERR, "error: insufficient memory");
2058 ret = err;
2059 goto failed_mount3;
2060 }
2061
2062 /* We have now updated the journal if required, so we can
2063 * validate the data journaling mode. */
2064 switch (test_opt(sb, DATA_FLAGS)) {
2065 case 0:
2066 /* No mode set, assume a default based on the journal
2067 capabilities: ORDERED_DATA if the journal can
2068 cope, else JOURNAL_DATA */
2069 if (journal_check_available_features
2070 (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE))
2071 set_opt(sbi->s_mount_opt, DEFAULT_DATA_MODE);
2072 else
2073 set_opt(sbi->s_mount_opt, JOURNAL_DATA);
2074 break;
2075
2076 case EXT3_MOUNT_ORDERED_DATA:
2077 case EXT3_MOUNT_WRITEBACK_DATA:
2078 if (!journal_check_available_features
2079 (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) {
2080 ext3_msg(sb, KERN_ERR,
2081 "error: journal does not support "
2082 "requested data journaling mode");
2083 goto failed_mount3;
2084 }
2085 default:
2086 break;
2087 }
2088
2089 /*
2090 * The journal_load will have done any necessary log recovery,
2091 * so we can safely mount the rest of the filesystem now.
2092 */
2093
2094 root = ext3_iget(sb, EXT3_ROOT_INO);
2095 if (IS_ERR(root)) {
2096 ext3_msg(sb, KERN_ERR, "error: get root inode failed");
2097 ret = PTR_ERR(root);
2098 goto failed_mount3;
2099 }
2100 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
2101 iput(root);
2102 ext3_msg(sb, KERN_ERR, "error: corrupt root inode, run e2fsck");
2103 goto failed_mount3;
2104 }
2105 sb->s_root = d_make_root(root);
2106 if (!sb->s_root) {
2107 ext3_msg(sb, KERN_ERR, "error: get root dentry failed");
2108 ret = -ENOMEM;
2109 goto failed_mount3;
2110 }
2111
2112 if (ext3_setup_super(sb, es, sb->s_flags & MS_RDONLY))
2113 sb->s_flags |= MS_RDONLY;
2114
2115 EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS;
2116 ext3_orphan_cleanup(sb, es);
2117 EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS;
2118 if (needs_recovery) {
2119 ext3_mark_recovery_complete(sb, es);
2120 ext3_msg(sb, KERN_INFO, "recovery complete");
2121 }
2122 ext3_msg(sb, KERN_INFO, "mounted filesystem with %s data mode",
2123 test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal":
2124 test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered":
2125 "writeback");
2126
2127 return 0;
2128
2129cantfind_ext3:
2130 if (!silent)
2131 ext3_msg(sb, KERN_INFO,
2132 "error: can't find ext3 filesystem on dev %s.",
2133 sb->s_id);
2134 goto failed_mount;
2135
2136failed_mount3:
2137 percpu_counter_destroy(&sbi->s_freeblocks_counter);
2138 percpu_counter_destroy(&sbi->s_freeinodes_counter);
2139 percpu_counter_destroy(&sbi->s_dirs_counter);
2140 journal_destroy(sbi->s_journal);
2141failed_mount2:
2142 for (i = 0; i < db_count; i++)
2143 brelse(sbi->s_group_desc[i]);
2144 kfree(sbi->s_group_desc);
2145failed_mount:
2146#ifdef CONFIG_QUOTA
2147 for (i = 0; i < EXT3_MAXQUOTAS; i++)
2148 kfree(sbi->s_qf_names[i]);
2149#endif
2150 ext3_blkdev_remove(sbi);
2151 brelse(bh);
2152out_fail:
2153 sb->s_fs_info = NULL;
2154 kfree(sbi->s_blockgroup_lock);
2155 kfree(sbi);
2156 return ret;
2157}
2158
2159/*
2160 * Setup any per-fs journal parameters now. We'll do this both on
2161 * initial mount, once the journal has been initialised but before we've
2162 * done any recovery; and again on any subsequent remount.
2163 */
2164static void ext3_init_journal_params(struct super_block *sb, journal_t *journal)
2165{
2166 struct ext3_sb_info *sbi = EXT3_SB(sb);
2167
2168 if (sbi->s_commit_interval)
2169 journal->j_commit_interval = sbi->s_commit_interval;
2170 /* We could also set up an ext3-specific default for the commit
2171 * interval here, but for now we'll just fall back to the jbd
2172 * default. */
2173
2174 spin_lock(&journal->j_state_lock);
2175 if (test_opt(sb, BARRIER))
2176 journal->j_flags |= JFS_BARRIER;
2177 else
2178 journal->j_flags &= ~JFS_BARRIER;
2179 if (test_opt(sb, DATA_ERR_ABORT))
2180 journal->j_flags |= JFS_ABORT_ON_SYNCDATA_ERR;
2181 else
2182 journal->j_flags &= ~JFS_ABORT_ON_SYNCDATA_ERR;
2183 spin_unlock(&journal->j_state_lock);
2184}
2185
2186static journal_t *ext3_get_journal(struct super_block *sb,
2187 unsigned int journal_inum)
2188{
2189 struct inode *journal_inode;
2190 journal_t *journal;
2191
2192 /* First, test for the existence of a valid inode on disk. Bad
2193 * things happen if we iget() an unused inode, as the subsequent
2194 * iput() will try to delete it. */
2195
2196 journal_inode = ext3_iget(sb, journal_inum);
2197 if (IS_ERR(journal_inode)) {
2198 ext3_msg(sb, KERN_ERR, "error: no journal found");
2199 return NULL;
2200 }
2201 if (!journal_inode->i_nlink) {
2202 make_bad_inode(journal_inode);
2203 iput(journal_inode);
2204 ext3_msg(sb, KERN_ERR, "error: journal inode is deleted");
2205 return NULL;
2206 }
2207
2208 jbd_debug(2, "Journal inode found at %p: %Ld bytes\n",
2209 journal_inode, journal_inode->i_size);
2210 if (!S_ISREG(journal_inode->i_mode)) {
2211 ext3_msg(sb, KERN_ERR, "error: invalid journal inode");
2212 iput(journal_inode);
2213 return NULL;
2214 }
2215
2216 journal = journal_init_inode(journal_inode);
2217 if (!journal) {
2218 ext3_msg(sb, KERN_ERR, "error: could not load journal inode");
2219 iput(journal_inode);
2220 return NULL;
2221 }
2222 journal->j_private = sb;
2223 ext3_init_journal_params(sb, journal);
2224 return journal;
2225}
2226
2227static journal_t *ext3_get_dev_journal(struct super_block *sb,
2228 dev_t j_dev)
2229{
2230 struct buffer_head * bh;
2231 journal_t *journal;
2232 ext3_fsblk_t start;
2233 ext3_fsblk_t len;
2234 int hblock, blocksize;
2235 ext3_fsblk_t sb_block;
2236 unsigned long offset;
2237 struct ext3_super_block * es;
2238 struct block_device *bdev;
2239
2240 bdev = ext3_blkdev_get(j_dev, sb);
2241 if (bdev == NULL)
2242 return NULL;
2243
2244 blocksize = sb->s_blocksize;
2245 hblock = bdev_logical_block_size(bdev);
2246 if (blocksize < hblock) {
2247 ext3_msg(sb, KERN_ERR,
2248 "error: blocksize too small for journal device");
2249 goto out_bdev;
2250 }
2251
2252 sb_block = EXT3_MIN_BLOCK_SIZE / blocksize;
2253 offset = EXT3_MIN_BLOCK_SIZE % blocksize;
2254 set_blocksize(bdev, blocksize);
2255 if (!(bh = __bread(bdev, sb_block, blocksize))) {
2256 ext3_msg(sb, KERN_ERR, "error: couldn't read superblock of "
2257 "external journal");
2258 goto out_bdev;
2259 }
2260
2261 es = (struct ext3_super_block *) (bh->b_data + offset);
2262 if ((le16_to_cpu(es->s_magic) != EXT3_SUPER_MAGIC) ||
2263 !(le32_to_cpu(es->s_feature_incompat) &
2264 EXT3_FEATURE_INCOMPAT_JOURNAL_DEV)) {
2265 ext3_msg(sb, KERN_ERR, "error: external journal has "
2266 "bad superblock");
2267 brelse(bh);
2268 goto out_bdev;
2269 }
2270
2271 if (memcmp(EXT3_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) {
2272 ext3_msg(sb, KERN_ERR, "error: journal UUID does not match");
2273 brelse(bh);
2274 goto out_bdev;
2275 }
2276
2277 len = le32_to_cpu(es->s_blocks_count);
2278 start = sb_block + 1;
2279 brelse(bh); /* we're done with the superblock */
2280
2281 journal = journal_init_dev(bdev, sb->s_bdev,
2282 start, len, blocksize);
2283 if (!journal) {
2284 ext3_msg(sb, KERN_ERR,
2285 "error: failed to create device journal");
2286 goto out_bdev;
2287 }
2288 journal->j_private = sb;
2289 if (!bh_uptodate_or_lock(journal->j_sb_buffer)) {
2290 if (bh_submit_read(journal->j_sb_buffer)) {
2291 ext3_msg(sb, KERN_ERR, "I/O error on journal device");
2292 goto out_journal;
2293 }
2294 }
2295 if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) {
2296 ext3_msg(sb, KERN_ERR,
2297 "error: external journal has more than one "
2298 "user (unsupported) - %d",
2299 be32_to_cpu(journal->j_superblock->s_nr_users));
2300 goto out_journal;
2301 }
2302 EXT3_SB(sb)->journal_bdev = bdev;
2303 ext3_init_journal_params(sb, journal);
2304 return journal;
2305out_journal:
2306 journal_destroy(journal);
2307out_bdev:
2308 ext3_blkdev_put(bdev);
2309 return NULL;
2310}
2311
2312static int ext3_load_journal(struct super_block *sb,
2313 struct ext3_super_block *es,
2314 unsigned long journal_devnum)
2315{
2316 journal_t *journal;
2317 unsigned int journal_inum = le32_to_cpu(es->s_journal_inum);
2318 dev_t journal_dev;
2319 int err = 0;
2320 int really_read_only;
2321
2322 if (journal_devnum &&
2323 journal_devnum != le32_to_cpu(es->s_journal_dev)) {
2324 ext3_msg(sb, KERN_INFO, "external journal device major/minor "
2325 "numbers have changed");
2326 journal_dev = new_decode_dev(journal_devnum);
2327 } else
2328 journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev));
2329
2330 really_read_only = bdev_read_only(sb->s_bdev);
2331
2332 /*
2333 * Are we loading a blank journal or performing recovery after a
2334 * crash? For recovery, we need to check in advance whether we
2335 * can get read-write access to the device.
2336 */
2337
2338 if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER)) {
2339 if (sb->s_flags & MS_RDONLY) {
2340 ext3_msg(sb, KERN_INFO,
2341 "recovery required on readonly filesystem");
2342 if (really_read_only) {
2343 ext3_msg(sb, KERN_ERR, "error: write access "
2344 "unavailable, cannot proceed");
2345 return -EROFS;
2346 }
2347 ext3_msg(sb, KERN_INFO,
2348 "write access will be enabled during recovery");
2349 }
2350 }
2351
2352 if (journal_inum && journal_dev) {
2353 ext3_msg(sb, KERN_ERR, "error: filesystem has both journal "
2354 "and inode journals");
2355 return -EINVAL;
2356 }
2357
2358 if (journal_inum) {
2359 if (!(journal = ext3_get_journal(sb, journal_inum)))
2360 return -EINVAL;
2361 } else {
2362 if (!(journal = ext3_get_dev_journal(sb, journal_dev)))
2363 return -EINVAL;
2364 }
2365
2366 if (!(journal->j_flags & JFS_BARRIER))
2367 printk(KERN_INFO "EXT3-fs: barriers not enabled\n");
2368
2369 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
2370 err = journal_update_format(journal);
2371 if (err) {
2372 ext3_msg(sb, KERN_ERR, "error updating journal");
2373 journal_destroy(journal);
2374 return err;
2375 }
2376 }
2377
2378 if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER))
2379 err = journal_wipe(journal, !really_read_only);
2380 if (!err)
2381 err = journal_load(journal);
2382
2383 if (err) {
2384 ext3_msg(sb, KERN_ERR, "error loading journal");
2385 journal_destroy(journal);
2386 return err;
2387 }
2388
2389 EXT3_SB(sb)->s_journal = journal;
2390 ext3_clear_journal_err(sb, es);
2391
2392 if (!really_read_only && journal_devnum &&
2393 journal_devnum != le32_to_cpu(es->s_journal_dev)) {
2394 es->s_journal_dev = cpu_to_le32(journal_devnum);
2395
2396 /* Make sure we flush the recovery flag to disk. */
2397 ext3_commit_super(sb, es, 1);
2398 }
2399
2400 return 0;
2401}
2402
2403static int ext3_create_journal(struct super_block *sb,
2404 struct ext3_super_block *es,
2405 unsigned int journal_inum)
2406{
2407 journal_t *journal;
2408 int err;
2409
2410 if (sb->s_flags & MS_RDONLY) {
2411 ext3_msg(sb, KERN_ERR,
2412 "error: readonly filesystem when trying to "
2413 "create journal");
2414 return -EROFS;
2415 }
2416
2417 journal = ext3_get_journal(sb, journal_inum);
2418 if (!journal)
2419 return -EINVAL;
2420
2421 ext3_msg(sb, KERN_INFO, "creating new journal on inode %u",
2422 journal_inum);
2423
2424 err = journal_create(journal);
2425 if (err) {
2426 ext3_msg(sb, KERN_ERR, "error creating journal");
2427 journal_destroy(journal);
2428 return -EIO;
2429 }
2430
2431 EXT3_SB(sb)->s_journal = journal;
2432
2433 ext3_update_dynamic_rev(sb);
2434 EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
2435 EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL);
2436
2437 es->s_journal_inum = cpu_to_le32(journal_inum);
2438
2439 /* Make sure we flush the recovery flag to disk. */
2440 ext3_commit_super(sb, es, 1);
2441
2442 return 0;
2443}
2444
2445static int ext3_commit_super(struct super_block *sb,
2446 struct ext3_super_block *es,
2447 int sync)
2448{
2449 struct buffer_head *sbh = EXT3_SB(sb)->s_sbh;
2450 int error = 0;
2451
2452 if (!sbh)
2453 return error;
2454
2455 if (buffer_write_io_error(sbh)) {
2456 /*
2457 * Oh, dear. A previous attempt to write the
2458 * superblock failed. This could happen because the
2459 * USB device was yanked out. Or it could happen to
2460 * be a transient write error and maybe the block will
2461 * be remapped. Nothing we can do but to retry the
2462 * write and hope for the best.
2463 */
2464 ext3_msg(sb, KERN_ERR, "previous I/O error to "
2465 "superblock detected");
2466 clear_buffer_write_io_error(sbh);
2467 set_buffer_uptodate(sbh);
2468 }
2469 /*
2470 * If the file system is mounted read-only, don't update the
2471 * superblock write time. This avoids updating the superblock
2472 * write time when we are mounting the root file system
2473 * read/only but we need to replay the journal; at that point,
2474 * for people who are east of GMT and who make their clock
2475 * tick in localtime for Windows bug-for-bug compatibility,
2476 * the clock is set in the future, and this will cause e2fsck
2477 * to complain and force a full file system check.
2478 */
2479 if (!(sb->s_flags & MS_RDONLY))
2480 es->s_wtime = cpu_to_le32(get_seconds());
2481 es->s_free_blocks_count = cpu_to_le32(ext3_count_free_blocks(sb));
2482 es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb));
2483 BUFFER_TRACE(sbh, "marking dirty");
2484 mark_buffer_dirty(sbh);
2485 if (sync) {
2486 error = sync_dirty_buffer(sbh);
2487 if (buffer_write_io_error(sbh)) {
2488 ext3_msg(sb, KERN_ERR, "I/O error while writing "
2489 "superblock");
2490 clear_buffer_write_io_error(sbh);
2491 set_buffer_uptodate(sbh);
2492 }
2493 }
2494 return error;
2495}
2496
2497
2498/*
2499 * Have we just finished recovery? If so, and if we are mounting (or
2500 * remounting) the filesystem readonly, then we will end up with a
2501 * consistent fs on disk. Record that fact.
2502 */
2503static void ext3_mark_recovery_complete(struct super_block * sb,
2504 struct ext3_super_block * es)
2505{
2506 journal_t *journal = EXT3_SB(sb)->s_journal;
2507
2508 journal_lock_updates(journal);
2509 if (journal_flush(journal) < 0)
2510 goto out;
2511
2512 if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) &&
2513 sb->s_flags & MS_RDONLY) {
2514 EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
2515 ext3_commit_super(sb, es, 1);
2516 }
2517
2518out:
2519 journal_unlock_updates(journal);
2520}
2521
2522/*
2523 * If we are mounting (or read-write remounting) a filesystem whose journal
2524 * has recorded an error from a previous lifetime, move that error to the
2525 * main filesystem now.
2526 */
2527static void ext3_clear_journal_err(struct super_block *sb,
2528 struct ext3_super_block *es)
2529{
2530 journal_t *journal;
2531 int j_errno;
2532 const char *errstr;
2533
2534 journal = EXT3_SB(sb)->s_journal;
2535
2536 /*
2537 * Now check for any error status which may have been recorded in the
2538 * journal by a prior ext3_error() or ext3_abort()
2539 */
2540
2541 j_errno = journal_errno(journal);
2542 if (j_errno) {
2543 char nbuf[16];
2544
2545 errstr = ext3_decode_error(sb, j_errno, nbuf);
2546 ext3_warning(sb, __func__, "Filesystem error recorded "
2547 "from previous mount: %s", errstr);
2548 ext3_warning(sb, __func__, "Marking fs in need of "
2549 "filesystem check.");
2550
2551 EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
2552 es->s_state |= cpu_to_le16(EXT3_ERROR_FS);
2553 ext3_commit_super (sb, es, 1);
2554
2555 journal_clear_err(journal);
2556 }
2557}
2558
2559/*
2560 * Force the running and committing transactions to commit,
2561 * and wait on the commit.
2562 */
2563int ext3_force_commit(struct super_block *sb)
2564{
2565 journal_t *journal;
2566 int ret;
2567
2568 if (sb->s_flags & MS_RDONLY)
2569 return 0;
2570
2571 journal = EXT3_SB(sb)->s_journal;
2572 ret = ext3_journal_force_commit(journal);
2573 return ret;
2574}
2575
2576static int ext3_sync_fs(struct super_block *sb, int wait)
2577{
2578 tid_t target;
2579
2580 trace_ext3_sync_fs(sb, wait);
2581 /*
2582 * Writeback quota in non-journalled quota case - journalled quota has
2583 * no dirty dquots
2584 */
2585 dquot_writeback_dquots(sb, -1);
2586 if (journal_start_commit(EXT3_SB(sb)->s_journal, &target)) {
2587 if (wait)
2588 log_wait_commit(EXT3_SB(sb)->s_journal, target);
2589 }
2590 return 0;
2591}
2592
2593/*
2594 * LVM calls this function before a (read-only) snapshot is created. This
2595 * gives us a chance to flush the journal completely and mark the fs clean.
2596 */
2597static int ext3_freeze(struct super_block *sb)
2598{
2599 int error = 0;
2600 journal_t *journal;
2601
2602 if (!(sb->s_flags & MS_RDONLY)) {
2603 journal = EXT3_SB(sb)->s_journal;
2604
2605 /* Now we set up the journal barrier. */
2606 journal_lock_updates(journal);
2607
2608 /*
2609 * We don't want to clear needs_recovery flag when we failed
2610 * to flush the journal.
2611 */
2612 error = journal_flush(journal);
2613 if (error < 0)
2614 goto out;
2615
2616 /* Journal blocked and flushed, clear needs_recovery flag. */
2617 EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
2618 error = ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1);
2619 if (error)
2620 goto out;
2621 }
2622 return 0;
2623
2624out:
2625 journal_unlock_updates(journal);
2626 return error;
2627}
2628
2629/*
2630 * Called by LVM after the snapshot is done. We need to reset the RECOVER
2631 * flag here, even though the filesystem is not technically dirty yet.
2632 */
2633static int ext3_unfreeze(struct super_block *sb)
2634{
2635 if (!(sb->s_flags & MS_RDONLY)) {
2636		/* Reset the needs_recovery flag before the fs is unlocked. */
2637 EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
2638 ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1);
2639 journal_unlock_updates(EXT3_SB(sb)->s_journal);
2640 }
2641 return 0;
2642}
2643
2644static int ext3_remount (struct super_block * sb, int * flags, char * data)
2645{
2646 struct ext3_super_block * es;
2647 struct ext3_sb_info *sbi = EXT3_SB(sb);
2648 ext3_fsblk_t n_blocks_count = 0;
2649 unsigned long old_sb_flags;
2650 struct ext3_mount_options old_opts;
2651 int enable_quota = 0;
2652 int err;
2653#ifdef CONFIG_QUOTA
2654 int i;
2655#endif
2656
2657 sync_filesystem(sb);
2658
2659 /* Store the original options */
2660 old_sb_flags = sb->s_flags;
2661 old_opts.s_mount_opt = sbi->s_mount_opt;
2662 old_opts.s_resuid = sbi->s_resuid;
2663 old_opts.s_resgid = sbi->s_resgid;
2664 old_opts.s_commit_interval = sbi->s_commit_interval;
2665#ifdef CONFIG_QUOTA
2666 old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
2667 for (i = 0; i < EXT3_MAXQUOTAS; i++)
2668 if (sbi->s_qf_names[i]) {
2669 old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i],
2670 GFP_KERNEL);
2671 if (!old_opts.s_qf_names[i]) {
2672 int j;
2673
2674 for (j = 0; j < i; j++)
2675 kfree(old_opts.s_qf_names[j]);
2676 return -ENOMEM;
2677 }
2678 } else
2679 old_opts.s_qf_names[i] = NULL;
2680#endif
2681
2682 /*
2683 * Allow the "check" option to be passed as a remount option.
2684 */
2685 if (!parse_options(data, sb, NULL, NULL, &n_blocks_count, 1)) {
2686 err = -EINVAL;
2687 goto restore_opts;
2688 }
2689
2690 if (test_opt(sb, ABORT))
2691 ext3_abort(sb, __func__, "Abort forced by user");
2692
2693 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
2694 (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
2695
2696 es = sbi->s_es;
2697
2698 ext3_init_journal_params(sb, sbi->s_journal);
2699
2700 if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) ||
2701 n_blocks_count > le32_to_cpu(es->s_blocks_count)) {
2702 if (test_opt(sb, ABORT)) {
2703 err = -EROFS;
2704 goto restore_opts;
2705 }
2706
2707 if (*flags & MS_RDONLY) {
2708 err = dquot_suspend(sb, -1);
2709 if (err < 0)
2710 goto restore_opts;
2711
2712 /*
2713 * First of all, the unconditional stuff we have to do
2714 * to disable replay of the journal when we next remount
2715 */
2716 sb->s_flags |= MS_RDONLY;
2717
2718 /*
2719 * OK, test if we are remounting a valid rw partition
2720 * readonly, and if so set the rdonly flag and then
2721 * mark the partition as valid again.
2722 */
2723 if (!(es->s_state & cpu_to_le16(EXT3_VALID_FS)) &&
2724 (sbi->s_mount_state & EXT3_VALID_FS))
2725 es->s_state = cpu_to_le16(sbi->s_mount_state);
2726
2727 ext3_mark_recovery_complete(sb, es);
2728 } else {
2729 __le32 ret;
2730 if ((ret = EXT3_HAS_RO_COMPAT_FEATURE(sb,
2731 ~EXT3_FEATURE_RO_COMPAT_SUPP))) {
2732 ext3_msg(sb, KERN_WARNING,
2733 "warning: couldn't remount RDWR "
2734 "because of unsupported optional "
2735 "features (%x)", le32_to_cpu(ret));
2736 err = -EROFS;
2737 goto restore_opts;
2738 }
2739
2740 /*
2741 * If we have an unprocessed orphan list hanging
2742 * around from a previously readonly bdev mount,
2743 * require a full umount & mount for now.
2744 */
2745 if (es->s_last_orphan) {
2746 ext3_msg(sb, KERN_WARNING, "warning: couldn't "
2747 "remount RDWR because of unprocessed "
2748 "orphan inode list. Please "
2749 "umount & mount instead.");
2750 err = -EINVAL;
2751 goto restore_opts;
2752 }
2753
2754 /*
2755 * Mounting a RDONLY partition read-write, so reread
2756 * and store the current valid flag. (It may have
2757 * been changed by e2fsck since we originally mounted
2758 * the partition.)
2759 */
2760 ext3_clear_journal_err(sb, es);
2761 sbi->s_mount_state = le16_to_cpu(es->s_state);
2762 if ((err = ext3_group_extend(sb, es, n_blocks_count)))
2763 goto restore_opts;
2764 if (!ext3_setup_super (sb, es, 0))
2765 sb->s_flags &= ~MS_RDONLY;
2766 enable_quota = 1;
2767 }
2768 }
2769#ifdef CONFIG_QUOTA
2770 /* Release old quota file names */
2771 for (i = 0; i < EXT3_MAXQUOTAS; i++)
2772 kfree(old_opts.s_qf_names[i]);
2773#endif
2774 if (enable_quota)
2775 dquot_resume(sb, -1);
2776 return 0;
2777restore_opts:
2778 sb->s_flags = old_sb_flags;
2779 sbi->s_mount_opt = old_opts.s_mount_opt;
2780 sbi->s_resuid = old_opts.s_resuid;
2781 sbi->s_resgid = old_opts.s_resgid;
2782 sbi->s_commit_interval = old_opts.s_commit_interval;
2783#ifdef CONFIG_QUOTA
2784 sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
2785 for (i = 0; i < EXT3_MAXQUOTAS; i++) {
2786 kfree(sbi->s_qf_names[i]);
2787 sbi->s_qf_names[i] = old_opts.s_qf_names[i];
2788 }
2789#endif
2790 return err;
2791}
2792
2793static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf)
2794{
2795 struct super_block *sb = dentry->d_sb;
2796 struct ext3_sb_info *sbi = EXT3_SB(sb);
2797 struct ext3_super_block *es = sbi->s_es;
2798 u64 fsid;
2799
2800 if (test_opt(sb, MINIX_DF)) {
2801 sbi->s_overhead_last = 0;
2802 } else if (sbi->s_blocks_last != le32_to_cpu(es->s_blocks_count)) {
2803 unsigned long ngroups = sbi->s_groups_count, i;
2804 ext3_fsblk_t overhead = 0;
2805 smp_rmb();
2806
2807 /*
2808 * Compute the overhead (FS structures). This is constant
2809 * for a given filesystem unless the number of block groups
2810		 * changes, so we cache the previous value until it does.
2811 */
2812
2813 /*
2814 * All of the blocks before first_data_block are
2815 * overhead
2816 */
2817 overhead = le32_to_cpu(es->s_first_data_block);
2818
2819 /*
2820 * Add the overhead attributed to the superblock and
2821 * block group descriptors. If the sparse superblocks
2822 * feature is turned on, then not all groups have this.
2823 */
2824 for (i = 0; i < ngroups; i++) {
2825 overhead += ext3_bg_has_super(sb, i) +
2826 ext3_bg_num_gdb(sb, i);
2827 cond_resched();
2828 }
2829
2830 /*
2831 * Every block group has an inode bitmap, a block
2832 * bitmap, and an inode table.
2833 */
2834 overhead += ngroups * (2 + sbi->s_itb_per_group);
2835
2836 /* Add the internal journal blocks as well */
2837 if (sbi->s_journal && !sbi->journal_bdev)
2838 overhead += sbi->s_journal->j_maxlen;
2839
2840 sbi->s_overhead_last = overhead;
2841 smp_wmb();
2842 sbi->s_blocks_last = le32_to_cpu(es->s_blocks_count);
2843 }
2844
2845 buf->f_type = EXT3_SUPER_MAGIC;
2846 buf->f_bsize = sb->s_blocksize;
2847 buf->f_blocks = le32_to_cpu(es->s_blocks_count) - sbi->s_overhead_last;
2848 buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter);
2849 buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count);
2850 if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count))
2851 buf->f_bavail = 0;
2852 buf->f_files = le32_to_cpu(es->s_inodes_count);
2853 buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
2854 buf->f_namelen = EXT3_NAME_LEN;
2855 fsid = le64_to_cpup((void *)es->s_uuid) ^
2856 le64_to_cpup((void *)es->s_uuid + sizeof(u64));
2857 buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
2858 buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
2859 return 0;
2860}
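
To make the overhead bookkeeping above concrete, the following userspace sketch runs the same accounting over a small, entirely made-up filesystem geometry; all constants, the fixed descriptor-block count per backup, and the bg_has_super() stand-in are illustrative assumptions rather than values read from a real filesystem:

#include <stdio.h>

/* Made-up example geometry; none of these values come from a real fs. */
#define BLOCKS_COUNT      5242880ULL	/* 20 GiB of 4 KiB blocks          */
#define FIRST_DATA_BLOCK  0ULL		/* 0 for 4 KiB block filesystems   */
#define NGROUPS           160ULL	/* 32768 blocks per group          */
#define GDT_BLOCKS        2ULL		/* descriptor blocks per backup    */
#define ITB_PER_GROUP     512ULL	/* inode-table blocks per group    */
#define JOURNAL_BLOCKS    32768ULL	/* internal journal (j_maxlen)     */
#define RESERVED_BLOCKS   262144ULL	/* root-reserved blocks            */

/* Stand-in for ext3_bg_has_super(): pretend only a handful of groups keep
 * a superblock backup.  A real sparse_super filesystem uses groups 0, 1
 * and the powers of 3, 5 and 7. */
static int bg_has_super(unsigned long long group)
{
	return group == 0 || group == 1 || group == 3 ||
	       group == 5 || group == 7 || group == 9;
}

int main(void)
{
	unsigned long long overhead = FIRST_DATA_BLOCK;
	unsigned long long i, bfree, bavail;

	/* superblock backups plus their group descriptor table copies */
	for (i = 0; i < NGROUPS; i++)
		if (bg_has_super(i))
			overhead += 1 + GDT_BLOCKS;

	/* every group has a block bitmap, an inode bitmap and an inode table */
	overhead += NGROUPS * (2 + ITB_PER_GROUP);

	/* an internal journal counts as overhead as well */
	overhead += JOURNAL_BLOCKS;

	/* pretend the filesystem is otherwise empty */
	bfree  = BLOCKS_COUNT - overhead;
	bavail = bfree > RESERVED_BLOCKS ? bfree - RESERVED_BLOCKS : 0;

	printf("f_blocks = %llu\n", BLOCKS_COUNT - overhead);
	printf("f_bfree  = %llu\n", bfree);
	printf("f_bavail = %llu\n", bavail);
	return 0;
}
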
2861
2862/* Helper functions for writing quotas on sync - we need to start a transaction before the quota
2863 * file is locked for write. Otherwise there are possible deadlocks:
2864 * Process 1 Process 2
2865 * ext3_create() quota_sync()
2866 * journal_start() write_dquot()
2867 * dquot_initialize() down(dqio_mutex)
2868 * down(dqio_mutex) journal_start()
2869 *
2870 */
2871
2872#ifdef CONFIG_QUOTA
2873
2874static inline struct inode *dquot_to_inode(struct dquot *dquot)
2875{
2876 return sb_dqopt(dquot->dq_sb)->files[dquot->dq_id.type];
2877}
2878
2879static int ext3_write_dquot(struct dquot *dquot)
2880{
2881 int ret, err;
2882 handle_t *handle;
2883 struct inode *inode;
2884
2885 inode = dquot_to_inode(dquot);
2886 handle = ext3_journal_start(inode,
2887 EXT3_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
2888 if (IS_ERR(handle))
2889 return PTR_ERR(handle);
2890 ret = dquot_commit(dquot);
2891 err = ext3_journal_stop(handle);
2892 if (!ret)
2893 ret = err;
2894 return ret;
2895}
2896
2897static int ext3_acquire_dquot(struct dquot *dquot)
2898{
2899 int ret, err;
2900 handle_t *handle;
2901
2902 handle = ext3_journal_start(dquot_to_inode(dquot),
2903 EXT3_QUOTA_INIT_BLOCKS(dquot->dq_sb));
2904 if (IS_ERR(handle))
2905 return PTR_ERR(handle);
2906 ret = dquot_acquire(dquot);
2907 err = ext3_journal_stop(handle);
2908 if (!ret)
2909 ret = err;
2910 return ret;
2911}
2912
2913static int ext3_release_dquot(struct dquot *dquot)
2914{
2915 int ret, err;
2916 handle_t *handle;
2917
2918 handle = ext3_journal_start(dquot_to_inode(dquot),
2919 EXT3_QUOTA_DEL_BLOCKS(dquot->dq_sb));
2920 if (IS_ERR(handle)) {
2921 /* Release dquot anyway to avoid endless cycle in dqput() */
2922 dquot_release(dquot);
2923 return PTR_ERR(handle);
2924 }
2925 ret = dquot_release(dquot);
2926 err = ext3_journal_stop(handle);
2927 if (!ret)
2928 ret = err;
2929 return ret;
2930}
2931
2932static int ext3_mark_dquot_dirty(struct dquot *dquot)
2933{
2934 /* Are we journaling quotas? */
2935 if (EXT3_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] ||
2936 EXT3_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) {
2937 dquot_mark_dquot_dirty(dquot);
2938 return ext3_write_dquot(dquot);
2939 } else {
2940 return dquot_mark_dquot_dirty(dquot);
2941 }
2942}
2943
2944static int ext3_write_info(struct super_block *sb, int type)
2945{
2946 int ret, err;
2947 handle_t *handle;
2948
2949 /* Data block + inode block */
2950 handle = ext3_journal_start(d_inode(sb->s_root), 2);
2951 if (IS_ERR(handle))
2952 return PTR_ERR(handle);
2953 ret = dquot_commit_info(sb, type);
2954 err = ext3_journal_stop(handle);
2955 if (!ret)
2956 ret = err;
2957 return ret;
2958}
2959
2960/*
2961 * Turn on quotas during mount time - we need to find
2962 * the quota file and such...
2963 */
2964static int ext3_quota_on_mount(struct super_block *sb, int type)
2965{
2966 return dquot_quota_on_mount(sb, EXT3_SB(sb)->s_qf_names[type],
2967 EXT3_SB(sb)->s_jquota_fmt, type);
2968}
2969
2970/*
2971 * Standard function to be called on quota_on
2972 */
2973static int ext3_quota_on(struct super_block *sb, int type, int format_id,
2974 struct path *path)
2975{
2976 int err;
2977
2978 if (!test_opt(sb, QUOTA))
2979 return -EINVAL;
2980
2981 /* Quotafile not on the same filesystem? */
2982 if (path->dentry->d_sb != sb)
2983 return -EXDEV;
2984 /* Journaling quota? */
2985 if (EXT3_SB(sb)->s_qf_names[type]) {
2986		/* Quotafile not in fs root? */
2987 if (path->dentry->d_parent != sb->s_root)
2988 ext3_msg(sb, KERN_WARNING,
2989 "warning: Quota file not on filesystem root. "
2990 "Journaled quota will not work.");
2991 }
2992
2993 /*
2994 * When we journal data on quota file, we have to flush journal to see
2995 * all updates to the file when we bypass pagecache...
2996 */
2997 if (ext3_should_journal_data(d_inode(path->dentry))) {
2998 /*
2999 * We don't need to lock updates but journal_flush() could
3000 * otherwise be livelocked...
3001 */
3002 journal_lock_updates(EXT3_SB(sb)->s_journal);
3003 err = journal_flush(EXT3_SB(sb)->s_journal);
3004 journal_unlock_updates(EXT3_SB(sb)->s_journal);
3005 if (err)
3006 return err;
3007 }
3008
3009 return dquot_quota_on(sb, type, format_id, path);
3010}
3011
3012/* Read data from quotafile - avoid pagecache and such because we cannot afford
3013 * acquiring the locks... As quota files are never truncated and quota code
3014 * itself serializes the operations (and no one else should touch the files)
3015 * we don't have to be afraid of races */
3016static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data,
3017 size_t len, loff_t off)
3018{
3019 struct inode *inode = sb_dqopt(sb)->files[type];
3020 sector_t blk = off >> EXT3_BLOCK_SIZE_BITS(sb);
3021 int err = 0;
3022 int offset = off & (sb->s_blocksize - 1);
3023 int tocopy;
3024 size_t toread;
3025 struct buffer_head *bh;
3026 loff_t i_size = i_size_read(inode);
3027
3028 if (off > i_size)
3029 return 0;
3030 if (off+len > i_size)
3031 len = i_size-off;
3032 toread = len;
3033 while (toread > 0) {
3034 tocopy = sb->s_blocksize - offset < toread ?
3035 sb->s_blocksize - offset : toread;
3036 bh = ext3_bread(NULL, inode, blk, 0, &err);
3037 if (err)
3038 return err;
3039 if (!bh) /* A hole? */
3040 memset(data, 0, tocopy);
3041 else
3042 memcpy(data, bh->b_data+offset, tocopy);
3043 brelse(bh);
3044 offset = 0;
3045 toread -= tocopy;
3046 data += tocopy;
3047 blk++;
3048 }
3049 return len;
3050}
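
The chunking done by the loop above can be sketched in userspace as follows; the block size, the read offsets and the file size are hypothetical, and the sketch only prints the per-block pieces instead of copying any data:

#include <stdio.h>

#define BLOCKSIZE 4096LL	/* assumed filesystem block size */

static void show_chunks(long long off, long long len, long long i_size)
{
	long long blk = off / BLOCKSIZE;
	long long offset = off % BLOCKSIZE;
	long long toread, tocopy;

	if (off > i_size)
		return;
	if (off + len > i_size)
		len = i_size - off;
	toread = len;
	while (toread > 0) {
		tocopy = BLOCKSIZE - offset < toread ?
			 BLOCKSIZE - offset : toread;
		printf("read %lld bytes from block %lld at offset %lld\n",
		       tocopy, blk, offset);
		offset = 0;	/* later blocks are read from their start */
		toread -= tocopy;
		blk++;
	}
}

int main(void)
{
	/* A 10000-byte read starting 100 bytes before a block boundary. */
	show_chunks(3996, 10000, 1LL << 20);
	return 0;
}
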
3051
3052/* Write to quotafile (we know the transaction is already started and has
3053 * enough credits) */
3054static ssize_t ext3_quota_write(struct super_block *sb, int type,
3055 const char *data, size_t len, loff_t off)
3056{
3057 struct inode *inode = sb_dqopt(sb)->files[type];
3058 sector_t blk = off >> EXT3_BLOCK_SIZE_BITS(sb);
3059 int err = 0;
3060 int offset = off & (sb->s_blocksize - 1);
3061 int journal_quota = EXT3_SB(sb)->s_qf_names[type] != NULL;
3062 struct buffer_head *bh;
3063 handle_t *handle = journal_current_handle();
3064
3065 if (!handle) {
3066 ext3_msg(sb, KERN_WARNING,
3067 "warning: quota write (off=%llu, len=%llu)"
3068 " cancelled because transaction is not started.",
3069 (unsigned long long)off, (unsigned long long)len);
3070 return -EIO;
3071 }
3072
3073 /*
3074	 * Since we account for only one data block in the transaction credits,
3075	 * it is impossible to cross a block boundary.
3076 */
3077 if (sb->s_blocksize - offset < len) {
3078 ext3_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
3079 " cancelled because not block aligned",
3080 (unsigned long long)off, (unsigned long long)len);
3081 return -EIO;
3082 }
3083 bh = ext3_bread(handle, inode, blk, 1, &err);
3084 if (!bh)
3085 goto out;
3086 if (journal_quota) {
3087 err = ext3_journal_get_write_access(handle, bh);
3088 if (err) {
3089 brelse(bh);
3090 goto out;
3091 }
3092 }
3093 lock_buffer(bh);
3094 memcpy(bh->b_data+offset, data, len);
3095 flush_dcache_page(bh->b_page);
3096 unlock_buffer(bh);
3097 if (journal_quota)
3098 err = ext3_journal_dirty_metadata(handle, bh);
3099 else {
3100 /* Always do at least ordered writes for quotas */
3101 err = ext3_journal_dirty_data(handle, bh);
3102 mark_buffer_dirty(bh);
3103 }
3104 brelse(bh);
3105out:
3106 if (err)
3107 return err;
3108 if (inode->i_size < off + len) {
3109 i_size_write(inode, off + len);
3110 EXT3_I(inode)->i_disksize = inode->i_size;
3111 }
3112 inode->i_version++;
3113 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
3114 ext3_mark_inode_dirty(handle, inode);
3115 return len;
3116}
3117
3118#endif
3119
3120static struct dentry *ext3_mount(struct file_system_type *fs_type,
3121 int flags, const char *dev_name, void *data)
3122{
3123 return mount_bdev(fs_type, flags, dev_name, data, ext3_fill_super);
3124}
3125
3126static struct file_system_type ext3_fs_type = {
3127 .owner = THIS_MODULE,
3128 .name = "ext3",
3129 .mount = ext3_mount,
3130 .kill_sb = kill_block_super,
3131 .fs_flags = FS_REQUIRES_DEV,
3132};
3133MODULE_ALIAS_FS("ext3");
3134
3135static int __init init_ext3_fs(void)
3136{
3137 int err = init_ext3_xattr();
3138 if (err)
3139 return err;
3140 err = init_inodecache();
3141 if (err)
3142 goto out1;
3143 err = register_filesystem(&ext3_fs_type);
3144 if (err)
3145 goto out;
3146 return 0;
3147out:
3148 destroy_inodecache();
3149out1:
3150 exit_ext3_xattr();
3151 return err;
3152}
3153
3154static void __exit exit_ext3_fs(void)
3155{
3156 unregister_filesystem(&ext3_fs_type);
3157 destroy_inodecache();
3158 exit_ext3_xattr();
3159}
3160
3161MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
3162MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions");
3163MODULE_LICENSE("GPL");
3164module_init(init_ext3_fs)
3165module_exit(exit_ext3_fs)
diff --git a/fs/ext3/symlink.c b/fs/ext3/symlink.c
deleted file mode 100644
index c08c59094ae6..000000000000
--- a/fs/ext3/symlink.c
+++ /dev/null
@@ -1,46 +0,0 @@
1/*
2 * linux/fs/ext3/symlink.c
3 *
4 * Only fast symlinks left here - the rest is done by generic code. AV, 1999
5 *
6 * Copyright (C) 1992, 1993, 1994, 1995
7 * Remy Card (card@masi.ibp.fr)
8 * Laboratoire MASI - Institut Blaise Pascal
9 * Universite Pierre et Marie Curie (Paris VI)
10 *
11 * from
12 *
13 * linux/fs/minix/symlink.c
14 *
15 * Copyright (C) 1991, 1992 Linus Torvalds
16 *
17 * ext3 symlink handling code
18 */
19
20#include "ext3.h"
21#include "xattr.h"
22
23const struct inode_operations ext3_symlink_inode_operations = {
24 .readlink = generic_readlink,
25 .follow_link = page_follow_link_light,
26 .put_link = page_put_link,
27 .setattr = ext3_setattr,
28#ifdef CONFIG_EXT3_FS_XATTR
29 .setxattr = generic_setxattr,
30 .getxattr = generic_getxattr,
31 .listxattr = ext3_listxattr,
32 .removexattr = generic_removexattr,
33#endif
34};
35
36const struct inode_operations ext3_fast_symlink_inode_operations = {
37 .readlink = generic_readlink,
38 .follow_link = simple_follow_link,
39 .setattr = ext3_setattr,
40#ifdef CONFIG_EXT3_FS_XATTR
41 .setxattr = generic_setxattr,
42 .getxattr = generic_getxattr,
43 .listxattr = ext3_listxattr,
44 .removexattr = generic_removexattr,
45#endif
46};
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
deleted file mode 100644
index 7cf36501ccf4..000000000000
--- a/fs/ext3/xattr.c
+++ /dev/null
@@ -1,1330 +0,0 @@
1/*
2 * linux/fs/ext3/xattr.c
3 *
4 * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
5 *
6 * Fix by Harrison Xing <harrison@mountainviewdata.com>.
7 * Ext3 code with a lot of help from Eric Jarman <ejarman@acm.org>.
8 * Extended attributes for symlinks and special files added per
9 * suggestion of Luka Renko <luka.renko@hermes.si>.
10 * xattr consolidation Copyright (c) 2004 James Morris <jmorris@redhat.com>,
11 * Red Hat Inc.
12 * ea-in-inode support by Alex Tomas <alex@clusterfs.com> aka bzzz
13 * and Andreas Gruenbacher <agruen@suse.de>.
14 */
15
16/*
17 * Extended attributes are stored directly in inodes (on file systems with
18 * inodes bigger than 128 bytes) and on additional disk blocks. The i_file_acl
19 * field contains the block number if an inode uses an additional block. All
20 * attributes must fit in the inode and one additional block. Blocks that
21 * contain the identical set of attributes may be shared among several inodes.
22 * Identical blocks are detected by keeping a cache of blocks that have
23 * recently been accessed.
24 *
25 * The attributes in inodes and on blocks have a different header; the entries
26 * are stored in the same format:
27 *
28 * +------------------+
29 * | header |
30 * | entry 1 | |
31 * | entry 2 | | growing downwards
32 * | entry 3 | v
33 * | four null bytes |
34 * | . . . |
35 * | value 1 | ^
36 * | value 3 | | growing upwards
37 * | value 2 | |
38 * +------------------+
39 *
40 * The header is followed by multiple entry descriptors. In disk blocks, the
41 * entry descriptors are kept sorted. In inodes, they are unsorted. The
42 * attribute values are aligned to the end of the block in no specific order.
43 *
44 * Locking strategy
45 * ----------------
46 * EXT3_I(inode)->i_file_acl is protected by EXT3_I(inode)->xattr_sem.
47 * EA blocks are only changed if they are exclusive to an inode, so
48 * holding xattr_sem also means that nothing but the EA block's reference
49 * count can change. Multiple writers to the same block are synchronized
50 * by the buffer lock.
51 */
52
53#include "ext3.h"
54#include <linux/mbcache.h>
55#include <linux/quotaops.h>
56#include "xattr.h"
57#include "acl.h"
58
59#define BHDR(bh) ((struct ext3_xattr_header *)((bh)->b_data))
60#define ENTRY(ptr) ((struct ext3_xattr_entry *)(ptr))
61#define BFIRST(bh) ENTRY(BHDR(bh)+1)
62#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)
63
64#define IHDR(inode, raw_inode) \
65 ((struct ext3_xattr_ibody_header *) \
66 ((void *)raw_inode + \
67 EXT3_GOOD_OLD_INODE_SIZE + \
68 EXT3_I(inode)->i_extra_isize))
69#define IFIRST(hdr) ((struct ext3_xattr_entry *)((hdr)+1))
70
71#ifdef EXT3_XATTR_DEBUG
72# define ea_idebug(inode, f...) do { \
73 printk(KERN_DEBUG "inode %s:%lu: ", \
74 inode->i_sb->s_id, inode->i_ino); \
75 printk(f); \
76 printk("\n"); \
77 } while (0)
78# define ea_bdebug(bh, f...) do { \
79 char b[BDEVNAME_SIZE]; \
80 printk(KERN_DEBUG "block %s:%lu: ", \
81 bdevname(bh->b_bdev, b), \
82 (unsigned long) bh->b_blocknr); \
83 printk(f); \
84 printk("\n"); \
85 } while (0)
86#else
87# define ea_idebug(f...)
88# define ea_bdebug(f...)
89#endif
90
91static void ext3_xattr_cache_insert(struct buffer_head *);
92static struct buffer_head *ext3_xattr_cache_find(struct inode *,
93 struct ext3_xattr_header *,
94 struct mb_cache_entry **);
95static void ext3_xattr_rehash(struct ext3_xattr_header *,
96 struct ext3_xattr_entry *);
97static int ext3_xattr_list(struct dentry *dentry, char *buffer,
98 size_t buffer_size);
99
100static struct mb_cache *ext3_xattr_cache;
101
102static const struct xattr_handler *ext3_xattr_handler_map[] = {
103 [EXT3_XATTR_INDEX_USER] = &ext3_xattr_user_handler,
104#ifdef CONFIG_EXT3_FS_POSIX_ACL
105 [EXT3_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler,
106 [EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler,
107#endif
108 [EXT3_XATTR_INDEX_TRUSTED] = &ext3_xattr_trusted_handler,
109#ifdef CONFIG_EXT3_FS_SECURITY
110 [EXT3_XATTR_INDEX_SECURITY] = &ext3_xattr_security_handler,
111#endif
112};
113
114const struct xattr_handler *ext3_xattr_handlers[] = {
115 &ext3_xattr_user_handler,
116 &ext3_xattr_trusted_handler,
117#ifdef CONFIG_EXT3_FS_POSIX_ACL
118 &posix_acl_access_xattr_handler,
119 &posix_acl_default_xattr_handler,
120#endif
121#ifdef CONFIG_EXT3_FS_SECURITY
122 &ext3_xattr_security_handler,
123#endif
124 NULL
125};
126
127static inline const struct xattr_handler *
128ext3_xattr_handler(int name_index)
129{
130 const struct xattr_handler *handler = NULL;
131
132 if (name_index > 0 && name_index < ARRAY_SIZE(ext3_xattr_handler_map))
133 handler = ext3_xattr_handler_map[name_index];
134 return handler;
135}
136
137/*
138 * Inode operation listxattr()
139 *
140 * d_inode(dentry)->i_mutex: don't care
141 */
142ssize_t
143ext3_listxattr(struct dentry *dentry, char *buffer, size_t size)
144{
145 return ext3_xattr_list(dentry, buffer, size);
146}
147
148static int
149ext3_xattr_check_names(struct ext3_xattr_entry *entry, void *end)
150{
151 while (!IS_LAST_ENTRY(entry)) {
152 struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(entry);
153 if ((void *)next >= end)
154 return -EIO;
155 entry = next;
156 }
157 return 0;
158}
159
160static inline int
161ext3_xattr_check_block(struct buffer_head *bh)
162{
163 int error;
164
165 if (BHDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
166 BHDR(bh)->h_blocks != cpu_to_le32(1))
167 return -EIO;
168 error = ext3_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size);
169 return error;
170}
171
172static inline int
173ext3_xattr_check_entry(struct ext3_xattr_entry *entry, size_t size)
174{
175 size_t value_size = le32_to_cpu(entry->e_value_size);
176
177 if (entry->e_value_block != 0 || value_size > size ||
178 le16_to_cpu(entry->e_value_offs) + value_size > size)
179 return -EIO;
180 return 0;
181}
182
183static int
184ext3_xattr_find_entry(struct ext3_xattr_entry **pentry, int name_index,
185 const char *name, size_t size, int sorted)
186{
187 struct ext3_xattr_entry *entry;
188 size_t name_len;
189 int cmp = 1;
190
191 if (name == NULL)
192 return -EINVAL;
193 name_len = strlen(name);
194 entry = *pentry;
195 for (; !IS_LAST_ENTRY(entry); entry = EXT3_XATTR_NEXT(entry)) {
196 cmp = name_index - entry->e_name_index;
197 if (!cmp)
198 cmp = name_len - entry->e_name_len;
199 if (!cmp)
200 cmp = memcmp(name, entry->e_name, name_len);
201 if (cmp <= 0 && (sorted || cmp == 0))
202 break;
203 }
204 *pentry = entry;
205 if (!cmp && ext3_xattr_check_entry(entry, size))
206 return -EIO;
207 return cmp ? -ENODATA : 0;
208}
209
210static int
211ext3_xattr_block_get(struct inode *inode, int name_index, const char *name,
212 void *buffer, size_t buffer_size)
213{
214 struct buffer_head *bh = NULL;
215 struct ext3_xattr_entry *entry;
216 size_t size;
217 int error;
218
219 ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
220 name_index, name, buffer, (long)buffer_size);
221
222 error = -ENODATA;
223 if (!EXT3_I(inode)->i_file_acl)
224 goto cleanup;
225 ea_idebug(inode, "reading block %u", EXT3_I(inode)->i_file_acl);
226 bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
227 if (!bh)
228 goto cleanup;
229 ea_bdebug(bh, "b_count=%d, refcount=%d",
230 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
231 if (ext3_xattr_check_block(bh)) {
232bad_block: ext3_error(inode->i_sb, __func__,
233 "inode %lu: bad block "E3FSBLK, inode->i_ino,
234 EXT3_I(inode)->i_file_acl);
235 error = -EIO;
236 goto cleanup;
237 }
238 ext3_xattr_cache_insert(bh);
239 entry = BFIRST(bh);
240 error = ext3_xattr_find_entry(&entry, name_index, name, bh->b_size, 1);
241 if (error == -EIO)
242 goto bad_block;
243 if (error)
244 goto cleanup;
245 size = le32_to_cpu(entry->e_value_size);
246 if (buffer) {
247 error = -ERANGE;
248 if (size > buffer_size)
249 goto cleanup;
250 memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs),
251 size);
252 }
253 error = size;
254
255cleanup:
256 brelse(bh);
257 return error;
258}
259
260static int
261ext3_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
262 void *buffer, size_t buffer_size)
263{
264 struct ext3_xattr_ibody_header *header;
265 struct ext3_xattr_entry *entry;
266 struct ext3_inode *raw_inode;
267 struct ext3_iloc iloc;
268 size_t size;
269 void *end;
270 int error;
271
272 if (!ext3_test_inode_state(inode, EXT3_STATE_XATTR))
273 return -ENODATA;
274 error = ext3_get_inode_loc(inode, &iloc);
275 if (error)
276 return error;
277 raw_inode = ext3_raw_inode(&iloc);
278 header = IHDR(inode, raw_inode);
279 entry = IFIRST(header);
280 end = (void *)raw_inode + EXT3_SB(inode->i_sb)->s_inode_size;
281 error = ext3_xattr_check_names(entry, end);
282 if (error)
283 goto cleanup;
284 error = ext3_xattr_find_entry(&entry, name_index, name,
285 end - (void *)entry, 0);
286 if (error)
287 goto cleanup;
288 size = le32_to_cpu(entry->e_value_size);
289 if (buffer) {
290 error = -ERANGE;
291 if (size > buffer_size)
292 goto cleanup;
293 memcpy(buffer, (void *)IFIRST(header) +
294 le16_to_cpu(entry->e_value_offs), size);
295 }
296 error = size;
297
298cleanup:
299 brelse(iloc.bh);
300 return error;
301}
302
303/*
304 * ext3_xattr_get()
305 *
306 * Copy an extended attribute into the buffer
307 * provided, or compute the buffer size required.
308 * Pass a NULL buffer to compute the required buffer size.
309 *
310 * Returns a negative error number on failure, or the number of bytes
311 * used / required on success.
312 */
313int
314ext3_xattr_get(struct inode *inode, int name_index, const char *name,
315 void *buffer, size_t buffer_size)
316{
317 int error;
318
319 down_read(&EXT3_I(inode)->xattr_sem);
320 error = ext3_xattr_ibody_get(inode, name_index, name, buffer,
321 buffer_size);
322 if (error == -ENODATA)
323 error = ext3_xattr_block_get(inode, name_index, name, buffer,
324 buffer_size);
325 up_read(&EXT3_I(inode)->xattr_sem);
326 return error;
327}
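A hedged caller sketch (hypothetical, not from this file) of the usual two-pass pattern against ext3_xattr_get(): the first call with a NULL buffer returns the value size, the second copies the value. The attribute name "foo" and the GFP_NOFS allocation are illustrative assumptions.

	static int example_read_user_xattr(struct inode *inode)
	{
		char *buf;
		int size, err;

		/* First call: NULL buffer returns the value size (or -ENODATA). */
		size = ext3_xattr_get(inode, EXT3_XATTR_INDEX_USER, "foo", NULL, 0);
		if (size < 0)
			return size;
		buf = kmalloc(size, GFP_NOFS);
		if (!buf)
			return -ENOMEM;
		/* Second call: copy the value into the buffer just allocated. */
		err = ext3_xattr_get(inode, EXT3_XATTR_INDEX_USER, "foo", buf, size);
		kfree(buf);
		return err < 0 ? err : 0;
	}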
328
329static int
330ext3_xattr_list_entries(struct dentry *dentry, struct ext3_xattr_entry *entry,
331 char *buffer, size_t buffer_size)
332{
333 size_t rest = buffer_size;
334
335 for (; !IS_LAST_ENTRY(entry); entry = EXT3_XATTR_NEXT(entry)) {
336 const struct xattr_handler *handler =
337 ext3_xattr_handler(entry->e_name_index);
338
339 if (handler) {
340 size_t size = handler->list(dentry, buffer, rest,
341 entry->e_name,
342 entry->e_name_len,
343 handler->flags);
344 if (buffer) {
345 if (size > rest)
346 return -ERANGE;
347 buffer += size;
348 }
349 rest -= size;
350 }
351 }
352 return buffer_size - rest;
353}
354
355static int
356ext3_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
357{
358 struct inode *inode = d_inode(dentry);
359 struct buffer_head *bh = NULL;
360 int error;
361
362 ea_idebug(inode, "buffer=%p, buffer_size=%ld",
363 buffer, (long)buffer_size);
364
365 error = 0;
366 if (!EXT3_I(inode)->i_file_acl)
367 goto cleanup;
368 ea_idebug(inode, "reading block %u", EXT3_I(inode)->i_file_acl);
369 bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
370 error = -EIO;
371 if (!bh)
372 goto cleanup;
373 ea_bdebug(bh, "b_count=%d, refcount=%d",
374 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
375 if (ext3_xattr_check_block(bh)) {
376 ext3_error(inode->i_sb, __func__,
377 "inode %lu: bad block "E3FSBLK, inode->i_ino,
378 EXT3_I(inode)->i_file_acl);
379 error = -EIO;
380 goto cleanup;
381 }
382 ext3_xattr_cache_insert(bh);
383 error = ext3_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size);
384
385cleanup:
386 brelse(bh);
387
388 return error;
389}
390
391static int
392ext3_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size)
393{
394 struct inode *inode = d_inode(dentry);
395 struct ext3_xattr_ibody_header *header;
396 struct ext3_inode *raw_inode;
397 struct ext3_iloc iloc;
398 void *end;
399 int error;
400
401 if (!ext3_test_inode_state(inode, EXT3_STATE_XATTR))
402 return 0;
403 error = ext3_get_inode_loc(inode, &iloc);
404 if (error)
405 return error;
406 raw_inode = ext3_raw_inode(&iloc);
407 header = IHDR(inode, raw_inode);
408 end = (void *)raw_inode + EXT3_SB(inode->i_sb)->s_inode_size;
409 error = ext3_xattr_check_names(IFIRST(header), end);
410 if (error)
411 goto cleanup;
412 error = ext3_xattr_list_entries(dentry, IFIRST(header),
413 buffer, buffer_size);
414
415cleanup:
416 brelse(iloc.bh);
417 return error;
418}
419
420/*
421 * ext3_xattr_list()
422 *
423 * Copy a list of attribute names into the buffer
424 * provided, or compute the buffer size required.
425 * Pass a NULL buffer to compute the required buffer size.
426 *
427 * Returns a negative error number on failure, or the number of bytes
428 * used / required on success.
429 */
430static int
431ext3_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
432{
433 int i_error, b_error;
434
435 down_read(&EXT3_I(d_inode(dentry))->xattr_sem);
436 i_error = ext3_xattr_ibody_list(dentry, buffer, buffer_size);
437 if (i_error < 0) {
438 b_error = 0;
439 } else {
440 if (buffer) {
441 buffer += i_error;
442 buffer_size -= i_error;
443 }
444 b_error = ext3_xattr_block_list(dentry, buffer, buffer_size);
445 if (b_error < 0)
446 i_error = 0;
447 }
448 up_read(&EXT3_I(d_inode(dentry))->xattr_sem);
449 return i_error + b_error;
450}
451
452/*
453 * If the EXT3_FEATURE_COMPAT_EXT_ATTR feature of this file system is
454 * not set, set it.
455 */
456static void ext3_xattr_update_super_block(handle_t *handle,
457 struct super_block *sb)
458{
459 if (EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR))
460 return;
461
462 if (ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh) == 0) {
463 EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR);
464 ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
465 }
466}
467
468/*
469 * Release the xattr block BH: If the reference count is > 1, decrement
470 * it; otherwise free the block.
471 */
472static void
473ext3_xattr_release_block(handle_t *handle, struct inode *inode,
474 struct buffer_head *bh)
475{
476 struct mb_cache_entry *ce = NULL;
477 int error = 0;
478
479 ce = mb_cache_entry_get(ext3_xattr_cache, bh->b_bdev, bh->b_blocknr);
480 error = ext3_journal_get_write_access(handle, bh);
481 if (error)
482 goto out;
483
484 lock_buffer(bh);
485
486 if (BHDR(bh)->h_refcount == cpu_to_le32(1)) {
487 ea_bdebug(bh, "refcount now=0; freeing");
488 if (ce)
489 mb_cache_entry_free(ce);
490 ext3_free_blocks(handle, inode, bh->b_blocknr, 1);
491 get_bh(bh);
492 ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
493 } else {
494 le32_add_cpu(&BHDR(bh)->h_refcount, -1);
495 error = ext3_journal_dirty_metadata(handle, bh);
496 if (IS_SYNC(inode))
497 handle->h_sync = 1;
498 dquot_free_block(inode, 1);
499 ea_bdebug(bh, "refcount now=%d; releasing",
500 le32_to_cpu(BHDR(bh)->h_refcount));
501 if (ce)
502 mb_cache_entry_release(ce);
503 }
504 unlock_buffer(bh);
505out:
506 ext3_std_error(inode->i_sb, error);
507 return;
508}
509
510struct ext3_xattr_info {
511 int name_index;
512 const char *name;
513 const void *value;
514 size_t value_len;
515};
516
517struct ext3_xattr_search {
518 struct ext3_xattr_entry *first;
519 void *base;
520 void *end;
521 struct ext3_xattr_entry *here;
522 int not_found;
523};
524
525static int
526ext3_xattr_set_entry(struct ext3_xattr_info *i, struct ext3_xattr_search *s)
527{
528 struct ext3_xattr_entry *last;
529 size_t free, min_offs = s->end - s->base, name_len = strlen(i->name);
530
531 /* Compute min_offs and last. */
532 last = s->first;
533 for (; !IS_LAST_ENTRY(last); last = EXT3_XATTR_NEXT(last)) {
534 if (!last->e_value_block && last->e_value_size) {
535 size_t offs = le16_to_cpu(last->e_value_offs);
536 if (offs < min_offs)
537 min_offs = offs;
538 }
539 }
540 free = min_offs - ((void *)last - s->base) - sizeof(__u32);
541 if (!s->not_found) {
542 if (!s->here->e_value_block && s->here->e_value_size) {
543 size_t size = le32_to_cpu(s->here->e_value_size);
544 free += EXT3_XATTR_SIZE(size);
545 }
546 free += EXT3_XATTR_LEN(name_len);
547 }
548 if (i->value) {
549 if (free < EXT3_XATTR_LEN(name_len) +
550 EXT3_XATTR_SIZE(i->value_len))
551 return -ENOSPC;
552 }
553
554 if (i->value && s->not_found) {
555 /* Insert the new name. */
556 size_t size = EXT3_XATTR_LEN(name_len);
557 size_t rest = (void *)last - (void *)s->here + sizeof(__u32);
558 memmove((void *)s->here + size, s->here, rest);
559 memset(s->here, 0, size);
560 s->here->e_name_index = i->name_index;
561 s->here->e_name_len = name_len;
562 memcpy(s->here->e_name, i->name, name_len);
563 } else {
564 if (!s->here->e_value_block && s->here->e_value_size) {
565 void *first_val = s->base + min_offs;
566 size_t offs = le16_to_cpu(s->here->e_value_offs);
567 void *val = s->base + offs;
568 size_t size = EXT3_XATTR_SIZE(
569 le32_to_cpu(s->here->e_value_size));
570
571 if (i->value && size == EXT3_XATTR_SIZE(i->value_len)) {
572 /* The old and the new value have the same
573 size. Just replace. */
574 s->here->e_value_size =
575 cpu_to_le32(i->value_len);
576 memset(val + size - EXT3_XATTR_PAD, 0,
577 EXT3_XATTR_PAD); /* Clear pad bytes. */
578 memcpy(val, i->value, i->value_len);
579 return 0;
580 }
581
582 /* Remove the old value. */
583 memmove(first_val + size, first_val, val - first_val);
584 memset(first_val, 0, size);
585 s->here->e_value_size = 0;
586 s->here->e_value_offs = 0;
587 min_offs += size;
588
589 /* Adjust all value offsets. */
590 last = s->first;
591 while (!IS_LAST_ENTRY(last)) {
592 size_t o = le16_to_cpu(last->e_value_offs);
593 if (!last->e_value_block &&
594 last->e_value_size && o < offs)
595 last->e_value_offs =
596 cpu_to_le16(o + size);
597 last = EXT3_XATTR_NEXT(last);
598 }
599 }
600 if (!i->value) {
601 /* Remove the old name. */
602 size_t size = EXT3_XATTR_LEN(name_len);
603 last = ENTRY((void *)last - size);
604 memmove(s->here, (void *)s->here + size,
605 (void *)last - (void *)s->here + sizeof(__u32));
606 memset(last, 0, size);
607 }
608 }
609
610 if (i->value) {
611 /* Insert the new value. */
612 s->here->e_value_size = cpu_to_le32(i->value_len);
613 if (i->value_len) {
614 size_t size = EXT3_XATTR_SIZE(i->value_len);
615 void *val = s->base + min_offs - size;
616 s->here->e_value_offs = cpu_to_le16(min_offs - size);
617 memset(val + size - EXT3_XATTR_PAD, 0,
618 EXT3_XATTR_PAD); /* Clear the pad bytes. */
619 memcpy(val, i->value, i->value_len);
620 }
621 }
622 return 0;
623}
624
625struct ext3_xattr_block_find {
626 struct ext3_xattr_search s;
627 struct buffer_head *bh;
628};
629
630static int
631ext3_xattr_block_find(struct inode *inode, struct ext3_xattr_info *i,
632 struct ext3_xattr_block_find *bs)
633{
634 struct super_block *sb = inode->i_sb;
635 int error;
636
637 ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld",
638 i->name_index, i->name, i->value, (long)i->value_len);
639
640 if (EXT3_I(inode)->i_file_acl) {
641 /* The inode already has an extended attribute block. */
642 bs->bh = sb_bread(sb, EXT3_I(inode)->i_file_acl);
643 error = -EIO;
644 if (!bs->bh)
645 goto cleanup;
646 ea_bdebug(bs->bh, "b_count=%d, refcount=%d",
647 atomic_read(&(bs->bh->b_count)),
648 le32_to_cpu(BHDR(bs->bh)->h_refcount));
649 if (ext3_xattr_check_block(bs->bh)) {
650 ext3_error(sb, __func__,
651 "inode %lu: bad block "E3FSBLK, inode->i_ino,
652 EXT3_I(inode)->i_file_acl);
653 error = -EIO;
654 goto cleanup;
655 }
656 /* Find the named attribute. */
657 bs->s.base = BHDR(bs->bh);
658 bs->s.first = BFIRST(bs->bh);
659 bs->s.end = bs->bh->b_data + bs->bh->b_size;
660 bs->s.here = bs->s.first;
661 error = ext3_xattr_find_entry(&bs->s.here, i->name_index,
662 i->name, bs->bh->b_size, 1);
663 if (error && error != -ENODATA)
664 goto cleanup;
665 bs->s.not_found = error;
666 }
667 error = 0;
668
669cleanup:
670 return error;
671}
672
673static int
674ext3_xattr_block_set(handle_t *handle, struct inode *inode,
675 struct ext3_xattr_info *i,
676 struct ext3_xattr_block_find *bs)
677{
678 struct super_block *sb = inode->i_sb;
679 struct buffer_head *new_bh = NULL;
680 struct ext3_xattr_search *s = &bs->s;
681 struct mb_cache_entry *ce = NULL;
682 int error = 0;
683
684#define header(x) ((struct ext3_xattr_header *)(x))
685
686 if (i->value && i->value_len > sb->s_blocksize)
687 return -ENOSPC;
688 if (s->base) {
689 ce = mb_cache_entry_get(ext3_xattr_cache, bs->bh->b_bdev,
690 bs->bh->b_blocknr);
691 error = ext3_journal_get_write_access(handle, bs->bh);
692 if (error)
693 goto cleanup;
694 lock_buffer(bs->bh);
695
696 if (header(s->base)->h_refcount == cpu_to_le32(1)) {
697 if (ce) {
698 mb_cache_entry_free(ce);
699 ce = NULL;
700 }
701 ea_bdebug(bs->bh, "modifying in-place");
702 error = ext3_xattr_set_entry(i, s);
703 if (!error) {
704 if (!IS_LAST_ENTRY(s->first))
705 ext3_xattr_rehash(header(s->base),
706 s->here);
707 ext3_xattr_cache_insert(bs->bh);
708 }
709 unlock_buffer(bs->bh);
710 if (error == -EIO)
711 goto bad_block;
712 if (!error)
713 error = ext3_journal_dirty_metadata(handle,
714 bs->bh);
715 if (error)
716 goto cleanup;
717 goto inserted;
718 } else {
719 int offset = (char *)s->here - bs->bh->b_data;
720
721 unlock_buffer(bs->bh);
722 journal_release_buffer(handle, bs->bh);
723
724 if (ce) {
725 mb_cache_entry_release(ce);
726 ce = NULL;
727 }
728 ea_bdebug(bs->bh, "cloning");
729 s->base = kmalloc(bs->bh->b_size, GFP_NOFS);
730 error = -ENOMEM;
731 if (s->base == NULL)
732 goto cleanup;
733 memcpy(s->base, BHDR(bs->bh), bs->bh->b_size);
734 s->first = ENTRY(header(s->base)+1);
735 header(s->base)->h_refcount = cpu_to_le32(1);
736 s->here = ENTRY(s->base + offset);
737 s->end = s->base + bs->bh->b_size;
738 }
739 } else {
740 /* Allocate a buffer where we construct the new block. */
741 s->base = kzalloc(sb->s_blocksize, GFP_NOFS);
742 /* assert(header == s->base) */
743 error = -ENOMEM;
744 if (s->base == NULL)
745 goto cleanup;
746 header(s->base)->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC);
747 header(s->base)->h_blocks = cpu_to_le32(1);
748 header(s->base)->h_refcount = cpu_to_le32(1);
749 s->first = ENTRY(header(s->base)+1);
750 s->here = ENTRY(header(s->base)+1);
751 s->end = s->base + sb->s_blocksize;
752 }
753
754 error = ext3_xattr_set_entry(i, s);
755 if (error == -EIO)
756 goto bad_block;
757 if (error)
758 goto cleanup;
759 if (!IS_LAST_ENTRY(s->first))
760 ext3_xattr_rehash(header(s->base), s->here);
761
762inserted:
763 if (!IS_LAST_ENTRY(s->first)) {
764 new_bh = ext3_xattr_cache_find(inode, header(s->base), &ce);
765 if (new_bh) {
766 /* We found an identical block in the cache. */
767 if (new_bh == bs->bh)
768 ea_bdebug(new_bh, "keeping");
769 else {
770 /* The old block is released after updating
771 the inode. */
772 error = dquot_alloc_block(inode, 1);
773 if (error)
774 goto cleanup;
775 error = ext3_journal_get_write_access(handle,
776 new_bh);
777 if (error)
778 goto cleanup_dquot;
779 lock_buffer(new_bh);
780 le32_add_cpu(&BHDR(new_bh)->h_refcount, 1);
781 ea_bdebug(new_bh, "reusing; refcount now=%d",
782 le32_to_cpu(BHDR(new_bh)->h_refcount));
783 unlock_buffer(new_bh);
784 error = ext3_journal_dirty_metadata(handle,
785 new_bh);
786 if (error)
787 goto cleanup_dquot;
788 }
789 mb_cache_entry_release(ce);
790 ce = NULL;
791 } else if (bs->bh && s->base == bs->bh->b_data) {
792 /* We were modifying this block in-place. */
793 ea_bdebug(bs->bh, "keeping this block");
794 new_bh = bs->bh;
795 get_bh(new_bh);
796 } else {
797 /* We need to allocate a new block */
798 ext3_fsblk_t goal = ext3_group_first_block_no(sb,
799 EXT3_I(inode)->i_block_group);
800 ext3_fsblk_t block;
801
802 /*
803 * Protect us against concurrent allocations to the
804 * same inode from ext3_..._writepage(). Reservation
805 * code does not expect racing allocations.
806 */
807 mutex_lock(&EXT3_I(inode)->truncate_mutex);
808 block = ext3_new_block(handle, inode, goal, &error);
809 mutex_unlock(&EXT3_I(inode)->truncate_mutex);
810 if (error)
811 goto cleanup;
812 ea_idebug(inode, "creating block %d", block);
813
814 new_bh = sb_getblk(sb, block);
815 if (unlikely(!new_bh)) {
816getblk_failed:
817 ext3_free_blocks(handle, inode, block, 1);
818 error = -ENOMEM;
819 goto cleanup;
820 }
821 lock_buffer(new_bh);
822 error = ext3_journal_get_create_access(handle, new_bh);
823 if (error) {
824 unlock_buffer(new_bh);
825 goto getblk_failed;
826 }
827 memcpy(new_bh->b_data, s->base, new_bh->b_size);
828 set_buffer_uptodate(new_bh);
829 unlock_buffer(new_bh);
830 ext3_xattr_cache_insert(new_bh);
831 error = ext3_journal_dirty_metadata(handle, new_bh);
832 if (error)
833 goto cleanup;
834 }
835 }
836
837 /* Update the inode. */
838 EXT3_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
839
840 /* Drop the previous xattr block. */
841 if (bs->bh && bs->bh != new_bh)
842 ext3_xattr_release_block(handle, inode, bs->bh);
843 error = 0;
844
845cleanup:
846 if (ce)
847 mb_cache_entry_release(ce);
848 brelse(new_bh);
849 if (!(bs->bh && s->base == bs->bh->b_data))
850 kfree(s->base);
851
852 return error;
853
854cleanup_dquot:
855 dquot_free_block(inode, 1);
856 goto cleanup;
857
858bad_block:
859 ext3_error(inode->i_sb, __func__,
860 "inode %lu: bad block "E3FSBLK, inode->i_ino,
861 EXT3_I(inode)->i_file_acl);
862 goto cleanup;
863
864#undef header
865}
866
867struct ext3_xattr_ibody_find {
868 struct ext3_xattr_search s;
869 struct ext3_iloc iloc;
870};
871
872static int
873ext3_xattr_ibody_find(struct inode *inode, struct ext3_xattr_info *i,
874 struct ext3_xattr_ibody_find *is)
875{
876 struct ext3_xattr_ibody_header *header;
877 struct ext3_inode *raw_inode;
878 int error;
879
880 if (EXT3_I(inode)->i_extra_isize == 0)
881 return 0;
882 raw_inode = ext3_raw_inode(&is->iloc);
883 header = IHDR(inode, raw_inode);
884 is->s.base = is->s.first = IFIRST(header);
885 is->s.here = is->s.first;
886 is->s.end = (void *)raw_inode + EXT3_SB(inode->i_sb)->s_inode_size;
887 if (ext3_test_inode_state(inode, EXT3_STATE_XATTR)) {
888 error = ext3_xattr_check_names(IFIRST(header), is->s.end);
889 if (error)
890 return error;
891 /* Find the named attribute. */
892 error = ext3_xattr_find_entry(&is->s.here, i->name_index,
893 i->name, is->s.end -
894 (void *)is->s.base, 0);
895 if (error && error != -ENODATA)
896 return error;
897 is->s.not_found = error;
898 }
899 return 0;
900}
901
902static int
903ext3_xattr_ibody_set(handle_t *handle, struct inode *inode,
904 struct ext3_xattr_info *i,
905 struct ext3_xattr_ibody_find *is)
906{
907 struct ext3_xattr_ibody_header *header;
908 struct ext3_xattr_search *s = &is->s;
909 int error;
910
911 if (EXT3_I(inode)->i_extra_isize == 0)
912 return -ENOSPC;
913 error = ext3_xattr_set_entry(i, s);
914 if (error)
915 return error;
916 header = IHDR(inode, ext3_raw_inode(&is->iloc));
917 if (!IS_LAST_ENTRY(s->first)) {
918 header->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC);
919 ext3_set_inode_state(inode, EXT3_STATE_XATTR);
920 } else {
921 header->h_magic = cpu_to_le32(0);
922 ext3_clear_inode_state(inode, EXT3_STATE_XATTR);
923 }
924 return 0;
925}
926
927/*
928 * ext3_xattr_set_handle()
929 *
930 * Create, replace or remove an extended attribute for this inode. A NULL
931 * value removes an existing extended attribute; a non-NULL value either
932 * replaces an existing extended attribute or creates a new extended
933 * attribute. The flags XATTR_REPLACE and XATTR_CREATE specify that an
934 * extended attribute must already exist and must not yet exist before
935 * the call, respectively.
936 *
937 * Returns 0, or a negative error number on failure.
938 */
939int
940ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
941 const char *name, const void *value, size_t value_len,
942 int flags)
943{
944 struct ext3_xattr_info i = {
945 .name_index = name_index,
946 .name = name,
947 .value = value,
948 .value_len = value_len,
949
950 };
951 struct ext3_xattr_ibody_find is = {
952 .s = { .not_found = -ENODATA, },
953 };
954 struct ext3_xattr_block_find bs = {
955 .s = { .not_found = -ENODATA, },
956 };
957 int error;
958
959 if (!name)
960 return -EINVAL;
961 if (strlen(name) > 255)
962 return -ERANGE;
963 down_write(&EXT3_I(inode)->xattr_sem);
964 error = ext3_get_inode_loc(inode, &is.iloc);
965 if (error)
966 goto cleanup;
967
968 error = ext3_journal_get_write_access(handle, is.iloc.bh);
969 if (error)
970 goto cleanup;
971
972 if (ext3_test_inode_state(inode, EXT3_STATE_NEW)) {
973 struct ext3_inode *raw_inode = ext3_raw_inode(&is.iloc);
974 memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);
975 ext3_clear_inode_state(inode, EXT3_STATE_NEW);
976 }
977
978 error = ext3_xattr_ibody_find(inode, &i, &is);
979 if (error)
980 goto cleanup;
981 if (is.s.not_found)
982 error = ext3_xattr_block_find(inode, &i, &bs);
983 if (error)
984 goto cleanup;
985 if (is.s.not_found && bs.s.not_found) {
986 error = -ENODATA;
987 if (flags & XATTR_REPLACE)
988 goto cleanup;
989 error = 0;
990 if (!value)
991 goto cleanup;
992 } else {
993 error = -EEXIST;
994 if (flags & XATTR_CREATE)
995 goto cleanup;
996 }
997 if (!value) {
998 if (!is.s.not_found)
999 error = ext3_xattr_ibody_set(handle, inode, &i, &is);
1000 else if (!bs.s.not_found)
1001 error = ext3_xattr_block_set(handle, inode, &i, &bs);
1002 } else {
1003 error = ext3_xattr_ibody_set(handle, inode, &i, &is);
1004 if (!error && !bs.s.not_found) {
1005 i.value = NULL;
1006 error = ext3_xattr_block_set(handle, inode, &i, &bs);
1007 } else if (error == -ENOSPC) {
1008 if (EXT3_I(inode)->i_file_acl && !bs.s.base) {
1009 error = ext3_xattr_block_find(inode, &i, &bs);
1010 if (error)
1011 goto cleanup;
1012 }
1013 error = ext3_xattr_block_set(handle, inode, &i, &bs);
1014 if (error)
1015 goto cleanup;
1016 if (!is.s.not_found) {
1017 i.value = NULL;
1018 error = ext3_xattr_ibody_set(handle, inode, &i,
1019 &is);
1020 }
1021 }
1022 }
1023 if (!error) {
1024 ext3_xattr_update_super_block(handle, inode->i_sb);
1025 inode->i_ctime = CURRENT_TIME_SEC;
1026 error = ext3_mark_iloc_dirty(handle, inode, &is.iloc);
1027 /*
1028 * The bh is consumed by ext3_mark_iloc_dirty, even with
1029 * error != 0.
1030 */
1031 is.iloc.bh = NULL;
1032 if (IS_SYNC(inode))
1033 handle->h_sync = 1;
1034 }
1035
1036cleanup:
1037 brelse(is.iloc.bh);
1038 brelse(bs.bh);
1039 up_write(&EXT3_I(inode)->xattr_sem);
1040 return error;
1041}
1042
1043/*
1044 * ext3_xattr_set()
1045 *
1046 * Like ext3_xattr_set_handle, but starts from an inode. This extended
1047 * attribute modification is a filesystem transaction by itself.
1048 *
1049 * Returns 0, or a negative error number on failure.
1050 */
1051int
1052ext3_xattr_set(struct inode *inode, int name_index, const char *name,
1053 const void *value, size_t value_len, int flags)
1054{
1055 handle_t *handle;
1056 int error, retries = 0;
1057
1058retry:
1059 handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS(inode->i_sb));
1060 if (IS_ERR(handle)) {
1061 error = PTR_ERR(handle);
1062 } else {
1063 int error2;
1064
1065 error = ext3_xattr_set_handle(handle, inode, name_index, name,
1066 value, value_len, flags);
1067 error2 = ext3_journal_stop(handle);
1068 if (error == -ENOSPC &&
1069 ext3_should_retry_alloc(inode->i_sb, &retries))
1070 goto retry;
1071 if (error == 0)
1072 error = error2;
1073 }
1074
1075 return error;
1076}
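For comparison, a hypothetical caller sketch for the setting side; the attribute name "foo" and value "bar" are made up, and XATTR_CREATE makes the call fail with -EEXIST if the attribute already exists:

	static int example_create_user_xattr(struct inode *inode)
	{
		static const char value[] = "bar";

		/* ext3_xattr_set() starts and stops its own journal transaction. */
		return ext3_xattr_set(inode, EXT3_XATTR_INDEX_USER, "foo",
				      value, sizeof(value) - 1, XATTR_CREATE);
	}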
1077
1078/*
1079 * ext3_xattr_delete_inode()
1080 *
1081 * Free extended attribute resources associated with this inode. This
1082 * is called immediately before an inode is freed. We have exclusive
1083 * access to the inode.
1084 */
1085void
1086ext3_xattr_delete_inode(handle_t *handle, struct inode *inode)
1087{
1088 struct buffer_head *bh = NULL;
1089
1090 if (!EXT3_I(inode)->i_file_acl)
1091 goto cleanup;
1092 bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
1093 if (!bh) {
1094 ext3_error(inode->i_sb, __func__,
1095 "inode %lu: block "E3FSBLK" read error", inode->i_ino,
1096 EXT3_I(inode)->i_file_acl);
1097 goto cleanup;
1098 }
1099 if (BHDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
1100 BHDR(bh)->h_blocks != cpu_to_le32(1)) {
1101 ext3_error(inode->i_sb, __func__,
1102 "inode %lu: bad block "E3FSBLK, inode->i_ino,
1103 EXT3_I(inode)->i_file_acl);
1104 goto cleanup;
1105 }
1106 ext3_xattr_release_block(handle, inode, bh);
1107 EXT3_I(inode)->i_file_acl = 0;
1108
1109cleanup:
1110 brelse(bh);
1111}
1112
1113/*
1114 * ext3_xattr_put_super()
1115 *
1116 * This is called when a file system is unmounted.
1117 */
1118void
1119ext3_xattr_put_super(struct super_block *sb)
1120{
1121 mb_cache_shrink(sb->s_bdev);
1122}
1123
1124/*
1125 * ext3_xattr_cache_insert()
1126 *
1127 * Create a new entry in the extended attribute cache, and insert
1128 * it unless such an entry is already in the cache.
1129 *
1130 * Insertion failures are ignored; the cache is only an optimization.
1131 */
1132static void
1133ext3_xattr_cache_insert(struct buffer_head *bh)
1134{
1135 __u32 hash = le32_to_cpu(BHDR(bh)->h_hash);
1136 struct mb_cache_entry *ce;
1137 int error;
1138
1139 ce = mb_cache_entry_alloc(ext3_xattr_cache, GFP_NOFS);
1140 if (!ce) {
1141 ea_bdebug(bh, "out of memory");
1142 return;
1143 }
1144 error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash);
1145 if (error) {
1146 mb_cache_entry_free(ce);
1147 if (error == -EBUSY) {
1148 ea_bdebug(bh, "already in cache");
1149 error = 0;
1150 }
1151 } else {
1152 ea_bdebug(bh, "inserting [%x]", (int)hash);
1153 mb_cache_entry_release(ce);
1154 }
1155}
1156
1157/*
1158 * ext3_xattr_cmp()
1159 *
1160 * Compare two extended attribute blocks for equality.
1161 *
1162 * Returns 0 if the blocks are equal, 1 if they differ, and
1163 * a negative error number on errors.
1164 */
1165static int
1166ext3_xattr_cmp(struct ext3_xattr_header *header1,
1167 struct ext3_xattr_header *header2)
1168{
1169 struct ext3_xattr_entry *entry1, *entry2;
1170
1171 entry1 = ENTRY(header1+1);
1172 entry2 = ENTRY(header2+1);
1173 while (!IS_LAST_ENTRY(entry1)) {
1174 if (IS_LAST_ENTRY(entry2))
1175 return 1;
1176 if (entry1->e_hash != entry2->e_hash ||
1177 entry1->e_name_index != entry2->e_name_index ||
1178 entry1->e_name_len != entry2->e_name_len ||
1179 entry1->e_value_size != entry2->e_value_size ||
1180 memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len))
1181 return 1;
1182 if (entry1->e_value_block != 0 || entry2->e_value_block != 0)
1183 return -EIO;
1184 if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs),
1185 (char *)header2 + le16_to_cpu(entry2->e_value_offs),
1186 le32_to_cpu(entry1->e_value_size)))
1187 return 1;
1188
1189 entry1 = EXT3_XATTR_NEXT(entry1);
1190 entry2 = EXT3_XATTR_NEXT(entry2);
1191 }
1192 if (!IS_LAST_ENTRY(entry2))
1193 return 1;
1194 return 0;
1195}
1196
1197/*
1198 * ext3_xattr_cache_find()
1199 *
1200 * Find an identical extended attribute block.
1201 *
1202 * Returns a pointer to the block found, or NULL if such a block was
1203 * not found or an error occurred.
1204 */
1205static struct buffer_head *
1206ext3_xattr_cache_find(struct inode *inode, struct ext3_xattr_header *header,
1207 struct mb_cache_entry **pce)
1208{
1209 __u32 hash = le32_to_cpu(header->h_hash);
1210 struct mb_cache_entry *ce;
1211
1212 if (!header->h_hash)
1213 return NULL; /* never share */
1214 ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
1215again:
1216 ce = mb_cache_entry_find_first(ext3_xattr_cache, inode->i_sb->s_bdev,
1217 hash);
1218 while (ce) {
1219 struct buffer_head *bh;
1220
1221 if (IS_ERR(ce)) {
1222 if (PTR_ERR(ce) == -EAGAIN)
1223 goto again;
1224 break;
1225 }
1226 bh = sb_bread(inode->i_sb, ce->e_block);
1227 if (!bh) {
1228 ext3_error(inode->i_sb, __func__,
1229 "inode %lu: block %lu read error",
1230 inode->i_ino, (unsigned long) ce->e_block);
1231 } else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
1232 EXT3_XATTR_REFCOUNT_MAX) {
1233 ea_idebug(inode, "block %lu refcount %d>=%d",
1234 (unsigned long) ce->e_block,
1235 le32_to_cpu(BHDR(bh)->h_refcount),
1236 EXT3_XATTR_REFCOUNT_MAX);
1237 } else if (ext3_xattr_cmp(header, BHDR(bh)) == 0) {
1238 *pce = ce;
1239 return bh;
1240 }
1241 brelse(bh);
1242 ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash);
1243 }
1244 return NULL;
1245}
1246
1247#define NAME_HASH_SHIFT 5
1248#define VALUE_HASH_SHIFT 16
1249
1250/*
1251 * ext3_xattr_hash_entry()
1252 *
1253 * Compute the hash of an extended attribute.
1254 */
1255static inline void ext3_xattr_hash_entry(struct ext3_xattr_header *header,
1256 struct ext3_xattr_entry *entry)
1257{
1258 __u32 hash = 0;
1259 char *name = entry->e_name;
1260 int n;
1261
1262 for (n=0; n < entry->e_name_len; n++) {
1263 hash = (hash << NAME_HASH_SHIFT) ^
1264 (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^
1265 *name++;
1266 }
1267
1268 if (entry->e_value_block == 0 && entry->e_value_size != 0) {
1269 __le32 *value = (__le32 *)((char *)header +
1270 le16_to_cpu(entry->e_value_offs));
1271 for (n = (le32_to_cpu(entry->e_value_size) +
1272 EXT3_XATTR_ROUND) >> EXT3_XATTR_PAD_BITS; n; n--) {
1273 hash = (hash << VALUE_HASH_SHIFT) ^
1274 (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^
1275 le32_to_cpu(*value++);
1276 }
1277 }
1278 entry->e_hash = cpu_to_le32(hash);
1279}
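A small worked example of the name part of this hash, assuming NAME_HASH_SHIFT == 5 and a 32-bit hash; the two-character name is purely illustrative:

	/*
	 * name = "fo":
	 *   start             hash = 0
	 *   after 'f' (0x66)  hash = (0 << 5) ^ (0 >> 27) ^ 0x66       = 0x66
	 *   after 'o' (0x6f)  hash = (0x66 << 5) ^ (0x66 >> 27) ^ 0x6f = 0xcaf
	 * The value words are then mixed in the same way using VALUE_HASH_SHIFT.
	 */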
1280
1281#undef NAME_HASH_SHIFT
1282#undef VALUE_HASH_SHIFT
1283
1284#define BLOCK_HASH_SHIFT 16
1285
1286/*
1287 * ext3_xattr_rehash()
1288 *
1289 * Re-compute the extended attribute hash value after an entry has changed.
1290 */
1291static void ext3_xattr_rehash(struct ext3_xattr_header *header,
1292 struct ext3_xattr_entry *entry)
1293{
1294 struct ext3_xattr_entry *here;
1295 __u32 hash = 0;
1296
1297 ext3_xattr_hash_entry(header, entry);
1298 here = ENTRY(header+1);
1299 while (!IS_LAST_ENTRY(here)) {
1300 if (!here->e_hash) {
1301 /* Block is not shared if an entry's hash value == 0 */
1302 hash = 0;
1303 break;
1304 }
1305 hash = (hash << BLOCK_HASH_SHIFT) ^
1306 (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^
1307 le32_to_cpu(here->e_hash);
1308 here = EXT3_XATTR_NEXT(here);
1309 }
1310 header->h_hash = cpu_to_le32(hash);
1311}
1312
1313#undef BLOCK_HASH_SHIFT
1314
1315int __init
1316init_ext3_xattr(void)
1317{
1318 ext3_xattr_cache = mb_cache_create("ext3_xattr", 6);
1319 if (!ext3_xattr_cache)
1320 return -ENOMEM;
1321 return 0;
1322}
1323
1324void
1325exit_ext3_xattr(void)
1326{
1327 if (ext3_xattr_cache)
1328 mb_cache_destroy(ext3_xattr_cache);
1329 ext3_xattr_cache = NULL;
1330}
diff --git a/fs/ext3/xattr.h b/fs/ext3/xattr.h
deleted file mode 100644
index 32e93ebf8031..000000000000
--- a/fs/ext3/xattr.h
+++ /dev/null
@@ -1,136 +0,0 @@
1/*
2 File: fs/ext3/xattr.h
3
4 On-disk format of extended attributes for the ext3 filesystem.
5
6 (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
7*/
8
9#include <linux/xattr.h>
10
11/* Magic value in attribute blocks */
12#define EXT3_XATTR_MAGIC 0xEA020000
13
14/* Maximum number of references to one attribute block */
15#define EXT3_XATTR_REFCOUNT_MAX 1024
16
17/* Name indexes */
18#define EXT3_XATTR_INDEX_USER 1
19#define EXT3_XATTR_INDEX_POSIX_ACL_ACCESS 2
20#define EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT 3
21#define EXT3_XATTR_INDEX_TRUSTED 4
22#define EXT3_XATTR_INDEX_LUSTRE 5
23#define EXT3_XATTR_INDEX_SECURITY 6
24
25struct ext3_xattr_header {
26 __le32 h_magic; /* magic number for identification */
27 __le32 h_refcount; /* reference count */
28 __le32 h_blocks; /* number of disk blocks used */
29 __le32 h_hash; /* hash value of all attributes */
30 __u32 h_reserved[4]; /* zero right now */
31};
32
33struct ext3_xattr_ibody_header {
34 __le32 h_magic; /* magic number for identification */
35};
36
37struct ext3_xattr_entry {
38 __u8 e_name_len; /* length of name */
39 __u8 e_name_index; /* attribute name index */
40 __le16 e_value_offs; /* offset in disk block of value */
41 __le32 e_value_block; /* disk block attribute is stored on (n/i) */
42 __le32 e_value_size; /* size of attribute value */
43 __le32 e_hash; /* hash value of name and value */
44 char e_name[0]; /* attribute name */
45};
46
47#define EXT3_XATTR_PAD_BITS 2
48#define EXT3_XATTR_PAD (1<<EXT3_XATTR_PAD_BITS)
49#define EXT3_XATTR_ROUND (EXT3_XATTR_PAD-1)
50#define EXT3_XATTR_LEN(name_len) \
51 (((name_len) + EXT3_XATTR_ROUND + \
52 sizeof(struct ext3_xattr_entry)) & ~EXT3_XATTR_ROUND)
53#define EXT3_XATTR_NEXT(entry) \
54 ( (struct ext3_xattr_entry *)( \
55 (char *)(entry) + EXT3_XATTR_LEN((entry)->e_name_len)) )
56#define EXT3_XATTR_SIZE(size) \
57 (((size) + EXT3_XATTR_ROUND) & ~EXT3_XATTR_ROUND)
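A short worked example of these rounding macros, assuming EXT3_XATTR_PAD == 4 and sizeof(struct ext3_xattr_entry) == 16 as defined above:

	/*
	 * EXT3_XATTR_LEN(3)  = (3 + 3 + 16) & ~3 = 20 bytes for a 3-byte name's descriptor
	 * EXT3_XATTR_SIZE(5) = (5 + 3) & ~3      =  8 bytes for a 5-byte value
	 */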
58
59# ifdef CONFIG_EXT3_FS_XATTR
60
61extern const struct xattr_handler ext3_xattr_user_handler;
62extern const struct xattr_handler ext3_xattr_trusted_handler;
63extern const struct xattr_handler ext3_xattr_security_handler;
64
65extern ssize_t ext3_listxattr(struct dentry *, char *, size_t);
66
67extern int ext3_xattr_get(struct inode *, int, const char *, void *, size_t);
68extern int ext3_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
69extern int ext3_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int);
70
71extern void ext3_xattr_delete_inode(handle_t *, struct inode *);
72extern void ext3_xattr_put_super(struct super_block *);
73
74extern int init_ext3_xattr(void);
75extern void exit_ext3_xattr(void);
76
77extern const struct xattr_handler *ext3_xattr_handlers[];
78
79# else /* CONFIG_EXT3_FS_XATTR */
80
81static inline int
82ext3_xattr_get(struct inode *inode, int name_index, const char *name,
83 void *buffer, size_t size, int flags)
84{
85 return -EOPNOTSUPP;
86}
87
88static inline int
89ext3_xattr_set(struct inode *inode, int name_index, const char *name,
90 const void *value, size_t size, int flags)
91{
92 return -EOPNOTSUPP;
93}
94
95static inline int
96ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
97 const char *name, const void *value, size_t size, int flags)
98{
99 return -EOPNOTSUPP;
100}
101
102static inline void
103ext3_xattr_delete_inode(handle_t *handle, struct inode *inode)
104{
105}
106
107static inline void
108ext3_xattr_put_super(struct super_block *sb)
109{
110}
111
112static inline int
113init_ext3_xattr(void)
114{
115 return 0;
116}
117
118static inline void
119exit_ext3_xattr(void)
120{
121}
122
123#define ext3_xattr_handlers NULL
124
125# endif /* CONFIG_EXT3_FS_XATTR */
126
127#ifdef CONFIG_EXT3_FS_SECURITY
128extern int ext3_init_security(handle_t *handle, struct inode *inode,
129 struct inode *dir, const struct qstr *qstr);
130#else
131static inline int ext3_init_security(handle_t *handle, struct inode *inode,
132 struct inode *dir, const struct qstr *qstr)
133{
134 return 0;
135}
136#endif
diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c
deleted file mode 100644
index c9506d5e3b13..000000000000
--- a/fs/ext3/xattr_security.c
+++ /dev/null
@@ -1,78 +0,0 @@
1/*
2 * linux/fs/ext3/xattr_security.c
3 * Handler for storing security labels as extended attributes.
4 */
5
6#include <linux/security.h>
7#include "ext3.h"
8#include "xattr.h"
9
10static size_t
11ext3_xattr_security_list(struct dentry *dentry, char *list, size_t list_size,
12 const char *name, size_t name_len, int type)
13{
14 const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
15 const size_t total_len = prefix_len + name_len + 1;
16
17
18 if (list && total_len <= list_size) {
19 memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
20 memcpy(list+prefix_len, name, name_len);
21 list[prefix_len + name_len] = '\0';
22 }
23 return total_len;
24}
25
26static int
27ext3_xattr_security_get(struct dentry *dentry, const char *name,
28 void *buffer, size_t size, int type)
29{
30 if (strcmp(name, "") == 0)
31 return -EINVAL;
32 return ext3_xattr_get(d_inode(dentry), EXT3_XATTR_INDEX_SECURITY,
33 name, buffer, size);
34}
35
36static int
37ext3_xattr_security_set(struct dentry *dentry, const char *name,
38 const void *value, size_t size, int flags, int type)
39{
40 if (strcmp(name, "") == 0)
41 return -EINVAL;
42 return ext3_xattr_set(d_inode(dentry), EXT3_XATTR_INDEX_SECURITY,
43 name, value, size, flags);
44}
45
46static int ext3_initxattrs(struct inode *inode,
47 const struct xattr *xattr_array,
48 void *fs_info)
49{
50 const struct xattr *xattr;
51 handle_t *handle = fs_info;
52 int err = 0;
53
54 for (xattr = xattr_array; xattr->name != NULL; xattr++) {
55 err = ext3_xattr_set_handle(handle, inode,
56 EXT3_XATTR_INDEX_SECURITY,
57 xattr->name, xattr->value,
58 xattr->value_len, 0);
59 if (err < 0)
60 break;
61 }
62 return err;
63}
64
65int
66ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir,
67 const struct qstr *qstr)
68{
69 return security_inode_init_security(inode, dir, qstr,
70 &ext3_initxattrs, handle);
71}
72
73const struct xattr_handler ext3_xattr_security_handler = {
74 .prefix = XATTR_SECURITY_PREFIX,
75 .list = ext3_xattr_security_list,
76 .get = ext3_xattr_security_get,
77 .set = ext3_xattr_security_set,
78};
diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c
deleted file mode 100644
index 206cc66dc285..000000000000
--- a/fs/ext3/xattr_trusted.c
+++ /dev/null
@@ -1,54 +0,0 @@
1/*
2 * linux/fs/ext3/xattr_trusted.c
3 * Handler for trusted extended attributes.
4 *
5 * Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
6 */
7
8#include "ext3.h"
9#include "xattr.h"
10
11static size_t
12ext3_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size,
13 const char *name, size_t name_len, int type)
14{
15 const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
16 const size_t total_len = prefix_len + name_len + 1;
17
18 if (!capable(CAP_SYS_ADMIN))
19 return 0;
20
21 if (list && total_len <= list_size) {
22 memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
23 memcpy(list+prefix_len, name, name_len);
24 list[prefix_len + name_len] = '\0';
25 }
26 return total_len;
27}
28
29static int
30ext3_xattr_trusted_get(struct dentry *dentry, const char *name,
31 void *buffer, size_t size, int type)
32{
33 if (strcmp(name, "") == 0)
34 return -EINVAL;
35 return ext3_xattr_get(d_inode(dentry), EXT3_XATTR_INDEX_TRUSTED,
36 name, buffer, size);
37}
38
39static int
40ext3_xattr_trusted_set(struct dentry *dentry, const char *name,
41 const void *value, size_t size, int flags, int type)
42{
43 if (strcmp(name, "") == 0)
44 return -EINVAL;
45 return ext3_xattr_set(d_inode(dentry), EXT3_XATTR_INDEX_TRUSTED, name,
46 value, size, flags);
47}
48
49const struct xattr_handler ext3_xattr_trusted_handler = {
50 .prefix = XATTR_TRUSTED_PREFIX,
51 .list = ext3_xattr_trusted_list,
52 .get = ext3_xattr_trusted_get,
53 .set = ext3_xattr_trusted_set,
54};
diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c
deleted file mode 100644
index 021508ad1616..000000000000
--- a/fs/ext3/xattr_user.c
+++ /dev/null
@@ -1,58 +0,0 @@
1/*
2 * linux/fs/ext3/xattr_user.c
3 * Handler for extended user attributes.
4 *
5 * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
6 */
7
8#include "ext3.h"
9#include "xattr.h"
10
11static size_t
12ext3_xattr_user_list(struct dentry *dentry, char *list, size_t list_size,
13 const char *name, size_t name_len, int type)
14{
15 const size_t prefix_len = XATTR_USER_PREFIX_LEN;
16 const size_t total_len = prefix_len + name_len + 1;
17
18 if (!test_opt(dentry->d_sb, XATTR_USER))
19 return 0;
20
21 if (list && total_len <= list_size) {
22 memcpy(list, XATTR_USER_PREFIX, prefix_len);
23 memcpy(list+prefix_len, name, name_len);
24 list[prefix_len + name_len] = '\0';
25 }
26 return total_len;
27}
28
29static int
30ext3_xattr_user_get(struct dentry *dentry, const char *name, void *buffer,
31 size_t size, int type)
32{
33 if (strcmp(name, "") == 0)
34 return -EINVAL;
35 if (!test_opt(dentry->d_sb, XATTR_USER))
36 return -EOPNOTSUPP;
37 return ext3_xattr_get(d_inode(dentry), EXT3_XATTR_INDEX_USER,
38 name, buffer, size);
39}
40
41static int
42ext3_xattr_user_set(struct dentry *dentry, const char *name,
43 const void *value, size_t size, int flags, int type)
44{
45 if (strcmp(name, "") == 0)
46 return -EINVAL;
47 if (!test_opt(dentry->d_sb, XATTR_USER))
48 return -EOPNOTSUPP;
49 return ext3_xattr_set(d_inode(dentry), EXT3_XATTR_INDEX_USER,
50 name, value, size, flags);
51}
52
53const struct xattr_handler ext3_xattr_user_handler = {
54 .prefix = XATTR_USER_PREFIX,
55 .list = ext3_xattr_user_list,
56 .get = ext3_xattr_user_get,
57 .set = ext3_xattr_user_set,
58};
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index bf8bc8aba471..47728da7702c 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -1,5 +1,38 @@
1# Ext3 configs are here for backward compatibility with old configs which may
2# have EXT3_FS set but not EXT4_FS set and thus would result in non-bootable
3# kernels after the removal of the ext3 driver.
4config EXT3_FS
5 tristate "The Extended 3 (ext3) filesystem"
6 # These must match EXT4_FS selects...
7 select EXT4_FS
8 select JBD2
9 select CRC16
10 select CRYPTO
11 select CRYPTO_CRC32C
12 help
13	  This config option is here only for backward compatibility. The ext3
14 filesystem is now handled by the ext4 driver.
15
16config EXT3_FS_POSIX_ACL
17 bool "Ext3 POSIX Access Control Lists"
18 depends on EXT3_FS
19 select EXT4_FS_POSIX_ACL
20 select FS_POSIX_ACL
21 help
22	  This config option is here only for backward compatibility. The ext3
23 filesystem is now handled by the ext4 driver.
24
25config EXT3_FS_SECURITY
26 bool "Ext3 Security Labels"
27 depends on EXT3_FS
28 select EXT4_FS_SECURITY
29 help
30	  This config option is here only for backward compatibility. The ext3
31 filesystem is now handled by the ext4 driver.
32
1config EXT4_FS 33config EXT4_FS
2 tristate "The Extended 4 (ext4) filesystem" 34 tristate "The Extended 4 (ext4) filesystem"
35 # Please update EXT3_FS selects when changing these
3 select JBD2 36 select JBD2
4 select CRC16 37 select CRC16
5 select CRYPTO 38 select CRYPTO
@@ -16,26 +49,27 @@ config EXT4_FS
16 up fsck time. For more information, please see the web pages at 49 up fsck time. For more information, please see the web pages at
17 http://ext4.wiki.kernel.org. 50 http://ext4.wiki.kernel.org.
18 51
19 The ext4 filesystem will support mounting an ext3 52 The ext4 filesystem supports mounting an ext3 filesystem; while there
20 filesystem; while there will be some performance gains from 53 are some performance gains from the delayed allocation and inode
21 the delayed allocation and inode table readahead, the best 54 table readahead, the best performance gains require enabling ext4
22 performance gains will require enabling ext4 features in the 55 features in the filesystem using tune2fs, or formatting a new
23 filesystem, or formatting a new filesystem as an ext4 56 filesystem as an ext4 filesystem initially. Without explicit enabling
24 filesystem initially. 57 of ext4 features, the on disk filesystem format stays fully backward
58 compatible.
25 59
26 To compile this file system support as a module, choose M here. The 60 To compile this file system support as a module, choose M here. The
27 module will be called ext4. 61 module will be called ext4.
28 62
29 If unsure, say N. 63 If unsure, say N.
30 64
31config EXT4_USE_FOR_EXT23 65config EXT4_USE_FOR_EXT2
32 bool "Use ext4 for ext2/ext3 file systems" 66 bool "Use ext4 for ext2/ext3 file systems"
33 depends on EXT4_FS 67 depends on EXT4_FS
34 depends on EXT3_FS=n || EXT2_FS=n 68 depends on EXT2_FS=n
35 default y 69 default y
36 help 70 help
37 Allow the ext4 file system driver code to be used for ext2 or 71 Allow the ext4 file system driver code to be used for ext2
38 ext3 file system mounts. This allows users to reduce their 72 file system mounts. This allows users to reduce their
39 compiled kernel size by using one file system driver for 73 compiled kernel size by using one file system driver for
40 ext2, ext3, and ext4 file systems. 74 ext2, ext3, and ext4 file systems.
41 75
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 173c1ae21395..619bfc1fda8c 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -721,7 +721,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
721 struct ext4_group_desc *gdp = NULL; 721 struct ext4_group_desc *gdp = NULL;
722 struct ext4_inode_info *ei; 722 struct ext4_inode_info *ei;
723 struct ext4_sb_info *sbi; 723 struct ext4_sb_info *sbi;
724 int ret2, err = 0; 724 int ret2, err;
725 struct inode *ret; 725 struct inode *ret;
726 ext4_group_t i; 726 ext4_group_t i;
727 ext4_group_t flex_group; 727 ext4_group_t flex_group;
@@ -769,7 +769,9 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
769 inode->i_gid = dir->i_gid; 769 inode->i_gid = dir->i_gid;
770 } else 770 } else
771 inode_init_owner(inode, dir, mode); 771 inode_init_owner(inode, dir, mode);
772 dquot_initialize(inode); 772 err = dquot_initialize(inode);
773 if (err)
774 goto out;
773 775
774 if (!goal) 776 if (!goal)
775 goal = sbi->s_inode_goal; 777 goal = sbi->s_inode_goal;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index cecf9aa10811..fed7ee7ea6e8 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4661,8 +4661,11 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
4661 if (error) 4661 if (error)
4662 return error; 4662 return error;
4663 4663
4664 if (is_quota_modification(inode, attr)) 4664 if (is_quota_modification(inode, attr)) {
4665 dquot_initialize(inode); 4665 error = dquot_initialize(inode);
4666 if (error)
4667 return error;
4668 }
4666 if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) || 4669 if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) ||
4667 (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) { 4670 (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
4668 handle_t *handle; 4671 handle_t *handle;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 011dcfb5cce3..9f61e7679a6d 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2436,7 +2436,9 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode,
2436 struct inode *inode; 2436 struct inode *inode;
2437 int err, credits, retries = 0; 2437 int err, credits, retries = 0;
2438 2438
2439 dquot_initialize(dir); 2439 err = dquot_initialize(dir);
2440 if (err)
2441 return err;
2440 2442
2441 credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 2443 credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2442 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); 2444 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
@@ -2470,7 +2472,9 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry,
2470 if (!new_valid_dev(rdev)) 2472 if (!new_valid_dev(rdev))
2471 return -EINVAL; 2473 return -EINVAL;
2472 2474
2473 dquot_initialize(dir); 2475 err = dquot_initialize(dir);
2476 if (err)
2477 return err;
2474 2478
2475 credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 2479 credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2476 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); 2480 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
@@ -2499,7 +2503,9 @@ static int ext4_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
2499 struct inode *inode; 2503 struct inode *inode;
2500 int err, retries = 0; 2504 int err, retries = 0;
2501 2505
2502 dquot_initialize(dir); 2506 err = dquot_initialize(dir);
2507 if (err)
2508 return err;
2503 2509
2504retry: 2510retry:
2505 inode = ext4_new_inode_start_handle(dir, mode, 2511 inode = ext4_new_inode_start_handle(dir, mode,
@@ -2612,7 +2618,9 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
2612 if (EXT4_DIR_LINK_MAX(dir)) 2618 if (EXT4_DIR_LINK_MAX(dir))
2613 return -EMLINK; 2619 return -EMLINK;
2614 2620
2615 dquot_initialize(dir); 2621 err = dquot_initialize(dir);
2622 if (err)
2623 return err;
2616 2624
2617 credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 2625 credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2618 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); 2626 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
@@ -2910,8 +2918,12 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
2910 2918
2911 /* Initialize quotas before so that eventual writes go in 2919 /* Initialize quotas before so that eventual writes go in
2912 * separate transaction */ 2920 * separate transaction */
2913 dquot_initialize(dir); 2921 retval = dquot_initialize(dir);
2914 dquot_initialize(d_inode(dentry)); 2922 if (retval)
2923 return retval;
2924 retval = dquot_initialize(d_inode(dentry));
2925 if (retval)
2926 return retval;
2915 2927
2916 retval = -ENOENT; 2928 retval = -ENOENT;
2917 bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); 2929 bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
@@ -2980,8 +2992,12 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
2980 trace_ext4_unlink_enter(dir, dentry); 2992 trace_ext4_unlink_enter(dir, dentry);
2981 /* Initialize quotas before so that eventual writes go 2993 /* Initialize quotas before so that eventual writes go
2982 * in separate transaction */ 2994 * in separate transaction */
2983 dquot_initialize(dir); 2995 retval = dquot_initialize(dir);
2984 dquot_initialize(d_inode(dentry)); 2996 if (retval)
2997 return retval;
2998 retval = dquot_initialize(d_inode(dentry));
2999 if (retval)
3000 return retval;
2985 3001
2986 retval = -ENOENT; 3002 retval = -ENOENT;
2987 bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); 3003 bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
@@ -3066,7 +3082,9 @@ static int ext4_symlink(struct inode *dir,
3066 goto err_free_sd; 3082 goto err_free_sd;
3067 } 3083 }
3068 3084
3069 dquot_initialize(dir); 3085 err = dquot_initialize(dir);
3086 if (err)
3087 goto err_free_sd;
3070 3088
3071 if ((disk_link.len > EXT4_N_BLOCKS * 4)) { 3089 if ((disk_link.len > EXT4_N_BLOCKS * 4)) {
3072 /* 3090 /*
@@ -3197,7 +3215,9 @@ static int ext4_link(struct dentry *old_dentry,
3197 if (ext4_encrypted_inode(dir) && 3215 if (ext4_encrypted_inode(dir) &&
3198 !ext4_is_child_context_consistent_with_parent(dir, inode)) 3216 !ext4_is_child_context_consistent_with_parent(dir, inode))
3199 return -EPERM; 3217 return -EPERM;
3200 dquot_initialize(dir); 3218 err = dquot_initialize(dir);
3219 if (err)
3220 return err;
3201 3221
3202retry: 3222retry:
3203 handle = ext4_journal_start(dir, EXT4_HT_DIR, 3223 handle = ext4_journal_start(dir, EXT4_HT_DIR,
@@ -3476,13 +3496,20 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
3476 int credits; 3496 int credits;
3477 u8 old_file_type; 3497 u8 old_file_type;
3478 3498
3479 dquot_initialize(old.dir); 3499 retval = dquot_initialize(old.dir);
3480 dquot_initialize(new.dir); 3500 if (retval)
3501 return retval;
3502 retval = dquot_initialize(new.dir);
3503 if (retval)
3504 return retval;
3481 3505
3482 /* Initialize quotas before so that eventual writes go 3506 /* Initialize quotas before so that eventual writes go
3483 * in separate transaction */ 3507 * in separate transaction */
3484 if (new.inode) 3508 if (new.inode) {
3485 dquot_initialize(new.inode); 3509 retval = dquot_initialize(new.inode);
3510 if (retval)
3511 return retval;
3512 }
3486 3513
3487 old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL); 3514 old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL);
3488 if (IS_ERR(old.bh)) 3515 if (IS_ERR(old.bh))
@@ -3678,8 +3705,12 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
3678 new.inode))) 3705 new.inode)))
3679 return -EPERM; 3706 return -EPERM;
3680 3707
3681 dquot_initialize(old.dir); 3708 retval = dquot_initialize(old.dir);
3682 dquot_initialize(new.dir); 3709 if (retval)
3710 return retval;
3711 retval = dquot_initialize(new.dir);
3712 if (retval)
3713 return retval;
3683 3714
3684 old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, 3715 old.bh = ext4_find_entry(old.dir, &old.dentry->d_name,
3685 &old.de, &old.inlined); 3716 &old.de, &old.inlined);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 58987b5c514b..06b4b14e8aa0 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -84,7 +84,7 @@ static void ext4_unregister_li_request(struct super_block *sb);
84static void ext4_clear_request_list(void); 84static void ext4_clear_request_list(void);
85static int ext4_reserve_clusters(struct ext4_sb_info *, ext4_fsblk_t); 85static int ext4_reserve_clusters(struct ext4_sb_info *, ext4_fsblk_t);
86 86
87#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) 87#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2)
88static struct file_system_type ext2_fs_type = { 88static struct file_system_type ext2_fs_type = {
89 .owner = THIS_MODULE, 89 .owner = THIS_MODULE,
90 .name = "ext2", 90 .name = "ext2",
@@ -100,7 +100,6 @@ MODULE_ALIAS("ext2");
100#endif 100#endif
101 101
102 102
103#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
104static struct file_system_type ext3_fs_type = { 103static struct file_system_type ext3_fs_type = {
105 .owner = THIS_MODULE, 104 .owner = THIS_MODULE,
106 .name = "ext3", 105 .name = "ext3",
@@ -111,9 +110,6 @@ static struct file_system_type ext3_fs_type = {
111MODULE_ALIAS_FS("ext3"); 110MODULE_ALIAS_FS("ext3");
112MODULE_ALIAS("ext3"); 111MODULE_ALIAS("ext3");
113#define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type) 112#define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type)
114#else
115#define IS_EXT3_SB(sb) (0)
116#endif
117 113
118static int ext4_verify_csum_type(struct super_block *sb, 114static int ext4_verify_csum_type(struct super_block *sb,
119 struct ext4_super_block *es) 115 struct ext4_super_block *es)
@@ -5500,7 +5496,7 @@ static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
5500 return mount_bdev(fs_type, flags, dev_name, data, ext4_fill_super); 5496 return mount_bdev(fs_type, flags, dev_name, data, ext4_fill_super);
5501} 5497}
5502 5498
5503#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) 5499#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2)
5504static inline void register_as_ext2(void) 5500static inline void register_as_ext2(void)
5505{ 5501{
5506 int err = register_filesystem(&ext2_fs_type); 5502 int err = register_filesystem(&ext2_fs_type);
@@ -5530,7 +5526,6 @@ static inline void unregister_as_ext2(void) { }
5530static inline int ext2_feature_set_ok(struct super_block *sb) { return 0; } 5526static inline int ext2_feature_set_ok(struct super_block *sb) { return 0; }
5531#endif 5527#endif
5532 5528
5533#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
5534static inline void register_as_ext3(void) 5529static inline void register_as_ext3(void)
5535{ 5530{
5536 int err = register_filesystem(&ext3_fs_type); 5531 int err = register_filesystem(&ext3_fs_type);
@@ -5556,11 +5551,6 @@ static inline int ext3_feature_set_ok(struct super_block *sb)
5556 return 0; 5551 return 0;
5557 return 1; 5552 return 1;
5558} 5553}
5559#else
5560static inline void register_as_ext3(void) { }
5561static inline void unregister_as_ext3(void) { }
5562static inline int ext3_feature_set_ok(struct super_block *sb) { return 0; }
5563#endif
5564 5554
5565static struct file_system_type ext4_fs_type = { 5555static struct file_system_type ext4_fs_type = {
5566 .owner = THIS_MODULE, 5556 .owner = THIS_MODULE,
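
With fs/ext3 gone, the ext3 compatibility glue in super.c above is built unconditionally, while the ext2 glue stays behind CONFIG_EXT4_USE_FOR_EXT2 (the Kconfig symbol renamed by this series). A simplified sketch of the resulting registration order, assuming that split; the real ext4_init_fs() also sets up caches, sysfs entries and more:

	static int __init ext4_register_aliases(void)	/* hypothetical helper */
	{
		int err;

		register_as_ext2();	/* no-op unless CONFIG_EXT4_USE_FOR_EXT2 */
		register_as_ext3();	/* now unconditional: ext4 always serves "ext3" */

		err = register_filesystem(&ext4_fs_type);
		if (err) {
			unregister_as_ext2();
			unregister_as_ext3();
		}
		return err;
	}
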
diff --git a/fs/jbd/Kconfig b/fs/jbd/Kconfig
deleted file mode 100644
index 4e28beeed157..000000000000
--- a/fs/jbd/Kconfig
+++ /dev/null
@@ -1,30 +0,0 @@
1config JBD
2 tristate
3 help
4 This is a generic journalling layer for block devices. It is
5 currently used by the ext3 file system, but it could also be
6 used to add journal support to other file systems or block
7 devices such as RAID or LVM.
8
9 If you are using the ext3 file system, you need to say Y here.
10 If you are not using ext3 then you will probably want to say N.
11
12 To compile this device as a module, choose M here: the module will be
13 called jbd. If you are compiling ext3 into the kernel, you
14 cannot compile this code as a module.
15
16config JBD_DEBUG
17 bool "JBD (ext3) debugging support"
18 depends on JBD && DEBUG_FS
19 help
20 If you are using the ext3 journaled file system (or potentially any
21 other file system/device using JBD), this option allows you to
22 enable debugging output while the system is running, in order to
23 help track down any problems you are having. By default the
24 debugging output will be turned off.
25
26 If you select Y here, then you will be able to turn on debugging
27 with "echo N > /sys/kernel/debug/jbd/jbd-debug", where N is a
28 number between 1 and 5, the higher the number, the more debugging
29 output is generated. To turn debugging off again, do
30 "echo 0 > /sys/kernel/debug/jbd/jbd-debug".
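
The JBD_DEBUG knob described in the help text above works by setting a global debug level that jbd's jbd_debug() macro compares against before printing. Roughly, as an approximation of the definition in include/linux/jbd.h (the exact macro differs slightly):

	extern u8 journal_enable_debug;	/* written via /sys/kernel/debug/jbd/jbd-debug */

	#define jbd_debug(n, fmt, a...)						\
		do {								\
			if ((n) <= journal_enable_debug)			\
				printk(KERN_DEBUG "(%s, %d): %s: " fmt,		\
				       __FILE__, __LINE__, __func__, ##a);	\
		} while (0)
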
diff --git a/fs/jbd/Makefile b/fs/jbd/Makefile
deleted file mode 100644
index 54aca4868a36..000000000000
--- a/fs/jbd/Makefile
+++ /dev/null
@@ -1,7 +0,0 @@
1#
2# Makefile for the linux journaling routines.
3#
4
5obj-$(CONFIG_JBD) += jbd.o
6
7jbd-objs := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
deleted file mode 100644
index 08c03044abdd..000000000000
--- a/fs/jbd/checkpoint.c
+++ /dev/null
@@ -1,782 +0,0 @@
1/*
2 * linux/fs/jbd/checkpoint.c
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
5 *
6 * Copyright 1999 Red Hat Software --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Checkpoint routines for the generic filesystem journaling code.
13 * Part of the ext2fs journaling system.
14 *
15 * Checkpointing is the process of ensuring that a section of the log is
16 * committed fully to disk, so that that portion of the log can be
17 * reused.
18 */
19
20#include <linux/time.h>
21#include <linux/fs.h>
22#include <linux/jbd.h>
23#include <linux/errno.h>
24#include <linux/slab.h>
25#include <linux/blkdev.h>
26#include <trace/events/jbd.h>
27
28/*
29 * Unlink a buffer from a transaction checkpoint list.
30 *
31 * Called with j_list_lock held.
32 */
33static inline void __buffer_unlink_first(struct journal_head *jh)
34{
35 transaction_t *transaction = jh->b_cp_transaction;
36
37 jh->b_cpnext->b_cpprev = jh->b_cpprev;
38 jh->b_cpprev->b_cpnext = jh->b_cpnext;
39 if (transaction->t_checkpoint_list == jh) {
40 transaction->t_checkpoint_list = jh->b_cpnext;
41 if (transaction->t_checkpoint_list == jh)
42 transaction->t_checkpoint_list = NULL;
43 }
44}
45
46/*
47 * Unlink a buffer from a transaction checkpoint(io) list.
48 *
49 * Called with j_list_lock held.
50 */
51static inline void __buffer_unlink(struct journal_head *jh)
52{
53 transaction_t *transaction = jh->b_cp_transaction;
54
55 __buffer_unlink_first(jh);
56 if (transaction->t_checkpoint_io_list == jh) {
57 transaction->t_checkpoint_io_list = jh->b_cpnext;
58 if (transaction->t_checkpoint_io_list == jh)
59 transaction->t_checkpoint_io_list = NULL;
60 }
61}
62
63/*
64 * Move a buffer from the checkpoint list to the checkpoint io list
65 *
66 * Called with j_list_lock held
67 */
68static inline void __buffer_relink_io(struct journal_head *jh)
69{
70 transaction_t *transaction = jh->b_cp_transaction;
71
72 __buffer_unlink_first(jh);
73
74 if (!transaction->t_checkpoint_io_list) {
75 jh->b_cpnext = jh->b_cpprev = jh;
76 } else {
77 jh->b_cpnext = transaction->t_checkpoint_io_list;
78 jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev;
79 jh->b_cpprev->b_cpnext = jh;
80 jh->b_cpnext->b_cpprev = jh;
81 }
82 transaction->t_checkpoint_io_list = jh;
83}
84
85/*
86 * Try to release a checkpointed buffer from its transaction.
87 * Returns 1 if we released it and 2 if we also released the
88 * whole transaction.
89 *
90 * Requires j_list_lock
91 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
92 */
93static int __try_to_free_cp_buf(struct journal_head *jh)
94{
95 int ret = 0;
96 struct buffer_head *bh = jh2bh(jh);
97
98 if (jh->b_jlist == BJ_None && !buffer_locked(bh) &&
99 !buffer_dirty(bh) && !buffer_write_io_error(bh)) {
100 /*
101 * Get our reference so that bh cannot be freed before
102 * we unlock it
103 */
104 get_bh(bh);
105 JBUFFER_TRACE(jh, "remove from checkpoint list");
106 ret = __journal_remove_checkpoint(jh) + 1;
107 jbd_unlock_bh_state(bh);
108 BUFFER_TRACE(bh, "release");
109 __brelse(bh);
110 } else {
111 jbd_unlock_bh_state(bh);
112 }
113 return ret;
114}
115
116/*
117 * __log_wait_for_space: wait until there is space in the journal.
118 *
119 * Called under j_state_lock *only*. It will be unlocked if we have to wait
120 * for a checkpoint to free up some space in the log.
121 */
122void __log_wait_for_space(journal_t *journal)
123{
124 int nblocks, space_left;
125 assert_spin_locked(&journal->j_state_lock);
126
127 nblocks = jbd_space_needed(journal);
128 while (__log_space_left(journal) < nblocks) {
129 if (journal->j_flags & JFS_ABORT)
130 return;
131 spin_unlock(&journal->j_state_lock);
132 mutex_lock(&journal->j_checkpoint_mutex);
133
134 /*
135 * Test again, another process may have checkpointed while we
136 * were waiting for the checkpoint lock. If there are no
137 * transactions ready to be checkpointed, try to recover
138 * journal space by calling cleanup_journal_tail(), and if
139 * that doesn't work, by waiting for the currently committing
140 * transaction to complete. If there is absolutely no way
141 * to make progress, this is either a BUG or corrupted
142 * filesystem, so abort the journal and leave a stack
143 * trace for forensic evidence.
144 */
145 spin_lock(&journal->j_state_lock);
146 spin_lock(&journal->j_list_lock);
147 nblocks = jbd_space_needed(journal);
148 space_left = __log_space_left(journal);
149 if (space_left < nblocks) {
150 int chkpt = journal->j_checkpoint_transactions != NULL;
151 tid_t tid = 0;
152
153 if (journal->j_committing_transaction)
154 tid = journal->j_committing_transaction->t_tid;
155 spin_unlock(&journal->j_list_lock);
156 spin_unlock(&journal->j_state_lock);
157 if (chkpt) {
158 log_do_checkpoint(journal);
159 } else if (cleanup_journal_tail(journal) == 0) {
160 /* We were able to recover space; yay! */
161 ;
162 } else if (tid) {
163 log_wait_commit(journal, tid);
164 } else {
165 printk(KERN_ERR "%s: needed %d blocks and "
166 "only had %d space available\n",
167 __func__, nblocks, space_left);
168 printk(KERN_ERR "%s: no way to get more "
169 "journal space\n", __func__);
170 WARN_ON(1);
171 journal_abort(journal, 0);
172 }
173 spin_lock(&journal->j_state_lock);
174 } else {
175 spin_unlock(&journal->j_list_lock);
176 }
177 mutex_unlock(&journal->j_checkpoint_mutex);
178 }
179}
180
181/*
182 * We were unable to perform jbd_trylock_bh_state() inside j_list_lock.
183 * The caller must restart a list walk. Wait for someone else to run
184 * jbd_unlock_bh_state().
185 */
186static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
187 __releases(journal->j_list_lock)
188{
189 get_bh(bh);
190 spin_unlock(&journal->j_list_lock);
191 jbd_lock_bh_state(bh);
192 jbd_unlock_bh_state(bh);
193 put_bh(bh);
194}
195
196/*
197 * Clean up transaction's list of buffers submitted for io.
198 * We wait for any pending IO to complete and remove any clean
199 * buffers. Note that we take the buffers in the opposite ordering
200 * from the one in which they were submitted for IO.
201 *
202 * Return 0 on success, and return <0 if some buffers have failed
203 * to be written out.
204 *
205 * Called with j_list_lock held.
206 */
207static int __wait_cp_io(journal_t *journal, transaction_t *transaction)
208{
209 struct journal_head *jh;
210 struct buffer_head *bh;
211 tid_t this_tid;
212 int released = 0;
213 int ret = 0;
214
215 this_tid = transaction->t_tid;
216restart:
217 /* Did somebody clean up the transaction in the meanwhile? */
218 if (journal->j_checkpoint_transactions != transaction ||
219 transaction->t_tid != this_tid)
220 return ret;
221 while (!released && transaction->t_checkpoint_io_list) {
222 jh = transaction->t_checkpoint_io_list;
223 bh = jh2bh(jh);
224 if (!jbd_trylock_bh_state(bh)) {
225 jbd_sync_bh(journal, bh);
226 spin_lock(&journal->j_list_lock);
227 goto restart;
228 }
229 get_bh(bh);
230 if (buffer_locked(bh)) {
231 spin_unlock(&journal->j_list_lock);
232 jbd_unlock_bh_state(bh);
233 wait_on_buffer(bh);
234 /* the journal_head may have gone by now */
235 BUFFER_TRACE(bh, "brelse");
236 __brelse(bh);
237 spin_lock(&journal->j_list_lock);
238 goto restart;
239 }
240 if (unlikely(buffer_write_io_error(bh)))
241 ret = -EIO;
242
243 /*
244 * Now in whatever state the buffer currently is, we know that
245 * it has been written out and so we can drop it from the list
246 */
247 released = __journal_remove_checkpoint(jh);
248 jbd_unlock_bh_state(bh);
249 __brelse(bh);
250 }
251
252 return ret;
253}
254
255#define NR_BATCH 64
256
257static void
258__flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
259{
260 int i;
261 struct blk_plug plug;
262
263 blk_start_plug(&plug);
264 for (i = 0; i < *batch_count; i++)
265 write_dirty_buffer(bhs[i], WRITE_SYNC);
266 blk_finish_plug(&plug);
267
268 for (i = 0; i < *batch_count; i++) {
269 struct buffer_head *bh = bhs[i];
270 clear_buffer_jwrite(bh);
271 BUFFER_TRACE(bh, "brelse");
272 __brelse(bh);
273 }
274 *batch_count = 0;
275}
276
277/*
278 * Try to flush one buffer from the checkpoint list to disk.
279 *
280 * Return 1 if something happened which requires us to abort the current
281 * scan of the checkpoint list. Return <0 if the buffer has failed to
282 * be written out.
283 *
284 * Called with j_list_lock held and drops it if 1 is returned
285 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
286 */
287static int __process_buffer(journal_t *journal, struct journal_head *jh,
288 struct buffer_head **bhs, int *batch_count)
289{
290 struct buffer_head *bh = jh2bh(jh);
291 int ret = 0;
292
293 if (buffer_locked(bh)) {
294 get_bh(bh);
295 spin_unlock(&journal->j_list_lock);
296 jbd_unlock_bh_state(bh);
297 wait_on_buffer(bh);
298 /* the journal_head may have gone by now */
299 BUFFER_TRACE(bh, "brelse");
300 __brelse(bh);
301 ret = 1;
302 } else if (jh->b_transaction != NULL) {
303 transaction_t *t = jh->b_transaction;
304 tid_t tid = t->t_tid;
305
306 spin_unlock(&journal->j_list_lock);
307 jbd_unlock_bh_state(bh);
308 log_start_commit(journal, tid);
309 log_wait_commit(journal, tid);
310 ret = 1;
311 } else if (!buffer_dirty(bh)) {
312 ret = 1;
313 if (unlikely(buffer_write_io_error(bh)))
314 ret = -EIO;
315 get_bh(bh);
316 J_ASSERT_JH(jh, !buffer_jbddirty(bh));
317 BUFFER_TRACE(bh, "remove from checkpoint");
318 __journal_remove_checkpoint(jh);
319 spin_unlock(&journal->j_list_lock);
320 jbd_unlock_bh_state(bh);
321 __brelse(bh);
322 } else {
323 /*
324 * Important: we are about to write the buffer, and
325 * possibly block, while still holding the journal lock.
326 * We cannot afford to let the transaction logic start
327 * messing around with this buffer before we write it to
328 * disk, as that would break recoverability.
329 */
330 BUFFER_TRACE(bh, "queue");
331 get_bh(bh);
332 J_ASSERT_BH(bh, !buffer_jwrite(bh));
333 set_buffer_jwrite(bh);
334 bhs[*batch_count] = bh;
335 __buffer_relink_io(jh);
336 jbd_unlock_bh_state(bh);
337 (*batch_count)++;
338 if (*batch_count == NR_BATCH) {
339 spin_unlock(&journal->j_list_lock);
340 __flush_batch(journal, bhs, batch_count);
341 ret = 1;
342 }
343 }
344 return ret;
345}
346
347/*
348 * Perform an actual checkpoint. We take the first transaction on the
349 * list of transactions to be checkpointed and send all its buffers
350 * to disk. We submit larger chunks of data at once.
351 *
352 * The journal should be locked before calling this function.
353 * Called with j_checkpoint_mutex held.
354 */
355int log_do_checkpoint(journal_t *journal)
356{
357 transaction_t *transaction;
358 tid_t this_tid;
359 int result;
360
361 jbd_debug(1, "Start checkpoint\n");
362
363 /*
364 * First thing: if there are any transactions in the log which
365 * don't need checkpointing, just eliminate them from the
366 * journal straight away.
367 */
368 result = cleanup_journal_tail(journal);
369 trace_jbd_checkpoint(journal, result);
370 jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
371 if (result <= 0)
372 return result;
373
374 /*
375 * OK, we need to start writing disk blocks. Take one transaction
376 * and write it.
377 */
378 result = 0;
379 spin_lock(&journal->j_list_lock);
380 if (!journal->j_checkpoint_transactions)
381 goto out;
382 transaction = journal->j_checkpoint_transactions;
383 this_tid = transaction->t_tid;
384restart:
385 /*
386 * If someone cleaned up this transaction while we slept, we're
387 * done (maybe it's a new transaction, but it fell at the same
388 * address).
389 */
390 if (journal->j_checkpoint_transactions == transaction &&
391 transaction->t_tid == this_tid) {
392 int batch_count = 0;
393 struct buffer_head *bhs[NR_BATCH];
394 struct journal_head *jh;
395 int retry = 0, err;
396
397 while (!retry && transaction->t_checkpoint_list) {
398 struct buffer_head *bh;
399
400 jh = transaction->t_checkpoint_list;
401 bh = jh2bh(jh);
402 if (!jbd_trylock_bh_state(bh)) {
403 jbd_sync_bh(journal, bh);
404 retry = 1;
405 break;
406 }
407 retry = __process_buffer(journal, jh, bhs,&batch_count);
408 if (retry < 0 && !result)
409 result = retry;
410 if (!retry && (need_resched() ||
411 spin_needbreak(&journal->j_list_lock))) {
412 spin_unlock(&journal->j_list_lock);
413 retry = 1;
414 break;
415 }
416 }
417
418 if (batch_count) {
419 if (!retry) {
420 spin_unlock(&journal->j_list_lock);
421 retry = 1;
422 }
423 __flush_batch(journal, bhs, &batch_count);
424 }
425
426 if (retry) {
427 spin_lock(&journal->j_list_lock);
428 goto restart;
429 }
430 /*
431 * Now we have cleaned up the first transaction's checkpoint
432 * list. Let's clean up the second one
433 */
434 err = __wait_cp_io(journal, transaction);
435 if (!result)
436 result = err;
437 }
438out:
439 spin_unlock(&journal->j_list_lock);
440 if (result < 0)
441 journal_abort(journal, result);
442 else
443 result = cleanup_journal_tail(journal);
444
445 return (result < 0) ? result : 0;
446}
447
448/*
449 * Check the list of checkpoint transactions for the journal to see if
450 * we have already got rid of any since the last update of the log tail
451 * in the journal superblock. If so, we can instantly roll the
452 * superblock forward to remove those transactions from the log.
453 *
454 * Return <0 on error, 0 on success, 1 if there was nothing to clean up.
455 *
456 * This is the only part of the journaling code which really needs to be
457 * aware of transaction aborts. Checkpointing involves writing to the
458 * main filesystem area rather than to the journal, so it can proceed
459 * even in abort state, but we must not update the super block if
460 * checkpointing may have failed. Otherwise, we would lose some metadata
461 * buffers which should be written back to the filesystem.
462 */
463
464int cleanup_journal_tail(journal_t *journal)
465{
466 transaction_t * transaction;
467 tid_t first_tid;
468 unsigned int blocknr, freed;
469
470 if (is_journal_aborted(journal))
471 return 1;
472
473 /*
474 * OK, work out the oldest transaction remaining in the log, and
475 * the log block it starts at.
476 *
477 * If the log is now empty, we need to work out which is the
478 * next transaction ID we will write, and where it will
479 * start.
480 */
481 spin_lock(&journal->j_state_lock);
482 spin_lock(&journal->j_list_lock);
483 transaction = journal->j_checkpoint_transactions;
484 if (transaction) {
485 first_tid = transaction->t_tid;
486 blocknr = transaction->t_log_start;
487 } else if ((transaction = journal->j_committing_transaction) != NULL) {
488 first_tid = transaction->t_tid;
489 blocknr = transaction->t_log_start;
490 } else if ((transaction = journal->j_running_transaction) != NULL) {
491 first_tid = transaction->t_tid;
492 blocknr = journal->j_head;
493 } else {
494 first_tid = journal->j_transaction_sequence;
495 blocknr = journal->j_head;
496 }
497 spin_unlock(&journal->j_list_lock);
498 J_ASSERT(blocknr != 0);
499
500 /* If the oldest pinned transaction is at the tail of the log
501 already then there's not much we can do right now. */
502 if (journal->j_tail_sequence == first_tid) {
503 spin_unlock(&journal->j_state_lock);
504 return 1;
505 }
506 spin_unlock(&journal->j_state_lock);
507
508 /*
509 * We need to make sure that any blocks that were recently written out
510 * --- perhaps by log_do_checkpoint() --- are flushed out before we
511 * drop the transactions from the journal. Similarly we need to be sure the
512 * superblock makes it to disk before the next transaction starts reusing
513 * freed space (otherwise we could replay some blocks of the new
514 * transaction thinking they belong to the old one). So we use
515 * WRITE_FLUSH_FUA. It's unlikely this will be necessary, especially
516 * with an appropriately sized journal, but we need this to guarantee
517 * correctness. Fortunately cleanup_journal_tail() doesn't get called
518 * all that often.
519 */
520 journal_update_sb_log_tail(journal, first_tid, blocknr,
521 WRITE_FLUSH_FUA);
522
523 spin_lock(&journal->j_state_lock);
524 /* OK, update the superblock to recover the freed space.
525 * Physical blocks come first: have we wrapped beyond the end of
526 * the log? */
527 freed = blocknr - journal->j_tail;
528 if (blocknr < journal->j_tail)
529 freed = freed + journal->j_last - journal->j_first;
530
531 trace_jbd_cleanup_journal_tail(journal, first_tid, blocknr, freed);
532 jbd_debug(1,
533 "Cleaning journal tail from %d to %d (offset %u), "
534 "freeing %u\n",
535 journal->j_tail_sequence, first_tid, blocknr, freed);
536
537 journal->j_free += freed;
538 journal->j_tail_sequence = first_tid;
539 journal->j_tail = blocknr;
540 spin_unlock(&journal->j_state_lock);
541 return 0;
542}
543
544
545/* Checkpoint list management */
546
547/*
548 * journal_clean_one_cp_list
549 *
550 * Find all the written-back checkpoint buffers in the given list and release
551 * them.
552 *
553 * Called with j_list_lock held.
554 * Returns number of buffers reaped (for debug)
555 */
556
557static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
558{
559 struct journal_head *last_jh;
560 struct journal_head *next_jh = jh;
561 int ret, freed = 0;
562
563 *released = 0;
564 if (!jh)
565 return 0;
566
567 last_jh = jh->b_cpprev;
568 do {
569 jh = next_jh;
570 next_jh = jh->b_cpnext;
571 /* Use trylock because of the ranking */
572 if (jbd_trylock_bh_state(jh2bh(jh))) {
573 ret = __try_to_free_cp_buf(jh);
574 if (ret) {
575 freed++;
576 if (ret == 2) {
577 *released = 1;
578 return freed;
579 }
580 }
581 }
582 /*
583 * This function only frees up some memory
584 * if possible so we don't have an obligation
585 * to finish processing. Bail out if preemption
586 * requested:
587 */
588 if (need_resched())
589 return freed;
590 } while (jh != last_jh);
591
592 return freed;
593}
594
595/*
596 * journal_clean_checkpoint_list
597 *
598 * Find all the written-back checkpoint buffers in the journal and release them.
599 *
600 * Called with the journal locked.
601 * Called with j_list_lock held.
602 * Returns number of buffers reaped (for debug)
603 */
604
605int __journal_clean_checkpoint_list(journal_t *journal)
606{
607 transaction_t *transaction, *last_transaction, *next_transaction;
608 int ret = 0;
609 int released;
610
611 transaction = journal->j_checkpoint_transactions;
612 if (!transaction)
613 goto out;
614
615 last_transaction = transaction->t_cpprev;
616 next_transaction = transaction;
617 do {
618 transaction = next_transaction;
619 next_transaction = transaction->t_cpnext;
620 ret += journal_clean_one_cp_list(transaction->
621 t_checkpoint_list, &released);
622 /*
623 * This function only frees up some memory if possible so we
624 * don't have an obligation to finish processing. Bail out if
625 * preemption requested:
626 */
627 if (need_resched())
628 goto out;
629 if (released)
630 continue;
631 /*
632 * It is essential that we are as careful as in the case of
633 * t_checkpoint_list with removing the buffer from the list as
634 * we can possibly see not yet submitted buffers on io_list
635 */
636 ret += journal_clean_one_cp_list(transaction->
637 t_checkpoint_io_list, &released);
638 if (need_resched())
639 goto out;
640 } while (transaction != last_transaction);
641out:
642 return ret;
643}
644
645/*
646 * journal_remove_checkpoint: called after a buffer has been committed
647 * to disk (either by being write-back flushed to disk, or being
648 * committed to the log).
649 *
650 * We cannot safely clean a transaction out of the log until all of the
651 * buffer updates committed in that transaction have safely been stored
652 * elsewhere on disk. To achieve this, all of the buffers in a
653 * transaction need to be maintained on the transaction's checkpoint
654 * lists until they have been rewritten, at which point this function is
655 * called to remove the buffer from the existing transaction's
656 * checkpoint lists.
657 *
658 * The function returns 1 if it frees the transaction, 0 otherwise.
659 * The function can free jh and bh.
660 *
661 * This function is called with j_list_lock held.
662 * This function is called with jbd_lock_bh_state(jh2bh(jh))
663 */
664
665int __journal_remove_checkpoint(struct journal_head *jh)
666{
667 transaction_t *transaction;
668 journal_t *journal;
669 int ret = 0;
670
671 JBUFFER_TRACE(jh, "entry");
672
673 if ((transaction = jh->b_cp_transaction) == NULL) {
674 JBUFFER_TRACE(jh, "not on transaction");
675 goto out;
676 }
677 journal = transaction->t_journal;
678
679 JBUFFER_TRACE(jh, "removing from transaction");
680 __buffer_unlink(jh);
681 jh->b_cp_transaction = NULL;
682 journal_put_journal_head(jh);
683
684 if (transaction->t_checkpoint_list != NULL ||
685 transaction->t_checkpoint_io_list != NULL)
686 goto out;
687
688 /*
689 * There is one special case to worry about: if we have just pulled the
690 * buffer off a running or committing transaction's checkpoint list,
691 * then even if the checkpoint list is empty, the transaction obviously
692 * cannot be dropped!
693 *
694 * The locking here around t_state is a bit sleazy.
695 * See the comment at the end of journal_commit_transaction().
696 */
697 if (transaction->t_state != T_FINISHED)
698 goto out;
699
700 /* OK, that was the last buffer for the transaction: we can now
701 safely remove this transaction from the log */
702
703 __journal_drop_transaction(journal, transaction);
704
705 /* Just in case anybody was waiting for more transactions to be
706 checkpointed... */
707 wake_up(&journal->j_wait_logspace);
708 ret = 1;
709out:
710 return ret;
711}
712
713/*
714 * journal_insert_checkpoint: put a committed buffer onto a checkpoint
715 * list so that we know when it is safe to clean the transaction out of
716 * the log.
717 *
718 * Called with the journal locked.
719 * Called with j_list_lock held.
720 */
721void __journal_insert_checkpoint(struct journal_head *jh,
722 transaction_t *transaction)
723{
724 JBUFFER_TRACE(jh, "entry");
725 J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh)));
726 J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
727
728 /* Get reference for checkpointing transaction */
729 journal_grab_journal_head(jh2bh(jh));
730 jh->b_cp_transaction = transaction;
731
732 if (!transaction->t_checkpoint_list) {
733 jh->b_cpnext = jh->b_cpprev = jh;
734 } else {
735 jh->b_cpnext = transaction->t_checkpoint_list;
736 jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev;
737 jh->b_cpprev->b_cpnext = jh;
738 jh->b_cpnext->b_cpprev = jh;
739 }
740 transaction->t_checkpoint_list = jh;
741}
742
743/*
744 * We've finished with this transaction structure: adios...
745 *
746 * The transaction must have no links except for the checkpoint by this
747 * point.
748 *
749 * Called with the journal locked.
750 * Called with j_list_lock held.
751 */
752
753void __journal_drop_transaction(journal_t *journal, transaction_t *transaction)
754{
755 assert_spin_locked(&journal->j_list_lock);
756 if (transaction->t_cpnext) {
757 transaction->t_cpnext->t_cpprev = transaction->t_cpprev;
758 transaction->t_cpprev->t_cpnext = transaction->t_cpnext;
759 if (journal->j_checkpoint_transactions == transaction)
760 journal->j_checkpoint_transactions =
761 transaction->t_cpnext;
762 if (journal->j_checkpoint_transactions == transaction)
763 journal->j_checkpoint_transactions = NULL;
764 }
765
766 J_ASSERT(transaction->t_state == T_FINISHED);
767 J_ASSERT(transaction->t_buffers == NULL);
768 J_ASSERT(transaction->t_sync_datalist == NULL);
769 J_ASSERT(transaction->t_forget == NULL);
770 J_ASSERT(transaction->t_iobuf_list == NULL);
771 J_ASSERT(transaction->t_shadow_list == NULL);
772 J_ASSERT(transaction->t_log_list == NULL);
773 J_ASSERT(transaction->t_checkpoint_list == NULL);
774 J_ASSERT(transaction->t_checkpoint_io_list == NULL);
775 J_ASSERT(transaction->t_updates == 0);
776 J_ASSERT(journal->j_committing_transaction != transaction);
777 J_ASSERT(journal->j_running_transaction != transaction);
778
779 trace_jbd_drop_transaction(journal, transaction);
780 jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
781 kfree(transaction);
782}
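
One detail in cleanup_journal_tail() above that benefits from a worked example is the tail-advance arithmetic: freed = blocknr - j_tail, with j_last - j_first added back when the new tail has wrapped past the end of the journal area. With illustrative numbers (not taken from any real journal):

	/*
	 * j_first = 1, j_last = 1024  =>  1023 usable journal blocks
	 *
	 * no wrap:  j_tail = 100, new blocknr = 300
	 *           freed = 300 - 100                = 200 blocks
	 *
	 * wrapped:  j_tail = 900, new blocknr = 50
	 *           freed = (50 - 900) + (1024 - 1)  = 173 blocks
	 *           i.e. (j_last - j_tail) + (blocknr - j_first)
	 */
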
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
deleted file mode 100644
index bb217dcb41af..000000000000
--- a/fs/jbd/commit.c
+++ /dev/null
@@ -1,1021 +0,0 @@
1/*
2 * linux/fs/jbd/commit.c
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5 *
6 * Copyright 1998 Red Hat corp --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Journal commit routines for the generic filesystem journaling code;
13 * part of the ext2fs journaling system.
14 */
15
16#include <linux/time.h>
17#include <linux/fs.h>
18#include <linux/jbd.h>
19#include <linux/errno.h>
20#include <linux/mm.h>
21#include <linux/pagemap.h>
22#include <linux/bio.h>
23#include <linux/blkdev.h>
24#include <trace/events/jbd.h>
25
26/*
27 * Default IO end handler for temporary BJ_IO buffer_heads.
28 */
29static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
30{
31 BUFFER_TRACE(bh, "");
32 if (uptodate)
33 set_buffer_uptodate(bh);
34 else
35 clear_buffer_uptodate(bh);
36 unlock_buffer(bh);
37}
38
39/*
40 * When an ext3-ordered file is truncated, it is possible that many pages are
41 * not successfully freed, because they are attached to a committing transaction.
42 * After the transaction commits, these pages are left on the LRU, with no
43 * ->mapping, and with attached buffers. These pages are trivially reclaimable
44 * by the VM, but their apparent absence upsets the VM accounting, and it makes
45 * the numbers in /proc/meminfo look odd.
46 *
47 * So here, we have a buffer which has just come off the forget list. Look to
48 * see if we can strip all buffers from the backing page.
49 *
50 * Called under journal->j_list_lock. The caller provided us with a ref
51 * against the buffer, and we drop that here.
52 */
53static void release_buffer_page(struct buffer_head *bh)
54{
55 struct page *page;
56
57 if (buffer_dirty(bh))
58 goto nope;
59 if (atomic_read(&bh->b_count) != 1)
60 goto nope;
61 page = bh->b_page;
62 if (!page)
63 goto nope;
64 if (page->mapping)
65 goto nope;
66
67 /* OK, it's a truncated page */
68 if (!trylock_page(page))
69 goto nope;
70
71 page_cache_get(page);
72 __brelse(bh);
73 try_to_free_buffers(page);
74 unlock_page(page);
75 page_cache_release(page);
76 return;
77
78nope:
79 __brelse(bh);
80}
81
82/*
83 * Decrement reference counter for data buffer. If it has been marked
84 * 'BH_Freed', release it and the page to which it belongs if possible.
85 */
86static void release_data_buffer(struct buffer_head *bh)
87{
88 if (buffer_freed(bh)) {
89 WARN_ON_ONCE(buffer_dirty(bh));
90 clear_buffer_freed(bh);
91 clear_buffer_mapped(bh);
92 clear_buffer_new(bh);
93 clear_buffer_req(bh);
94 bh->b_bdev = NULL;
95 release_buffer_page(bh);
96 } else
97 put_bh(bh);
98}
99
100/*
101 * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
102 * held. For ranking reasons we must trylock. If we lose, schedule away and
103 * return 0. j_list_lock is dropped in this case.
104 */
105static int inverted_lock(journal_t *journal, struct buffer_head *bh)
106{
107 if (!jbd_trylock_bh_state(bh)) {
108 spin_unlock(&journal->j_list_lock);
109 schedule();
110 return 0;
111 }
112 return 1;
113}
114
115/* Done it all: now write the commit record. We should have
116 * cleaned up our previous buffers by now, so if we are in abort
117 * mode we can now just skip the rest of the journal write
118 * entirely.
119 *
120 * Returns 1 if the journal needs to be aborted or 0 on success
121 */
122static int journal_write_commit_record(journal_t *journal,
123 transaction_t *commit_transaction)
124{
125 struct journal_head *descriptor;
126 struct buffer_head *bh;
127 journal_header_t *header;
128 int ret;
129
130 if (is_journal_aborted(journal))
131 return 0;
132
133 descriptor = journal_get_descriptor_buffer(journal);
134 if (!descriptor)
135 return 1;
136
137 bh = jh2bh(descriptor);
138
139 header = (journal_header_t *)(bh->b_data);
140 header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
141 header->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
142 header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
143
144 JBUFFER_TRACE(descriptor, "write commit block");
145 set_buffer_dirty(bh);
146
147 if (journal->j_flags & JFS_BARRIER)
148 ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_FLUSH_FUA);
149 else
150 ret = sync_dirty_buffer(bh);
151
152 put_bh(bh); /* One for getblk() */
153 journal_put_journal_head(descriptor);
154
155 return (ret == -EIO);
156}
157
158static void journal_do_submit_data(struct buffer_head **wbuf, int bufs,
159 int write_op)
160{
161 int i;
162
163 for (i = 0; i < bufs; i++) {
164 wbuf[i]->b_end_io = end_buffer_write_sync;
165 /*
166 * Here we write back pagecache data that may be mmaped. Since
167 * we cannot afford to clean the page and set PageWriteback
168 * here due to lock ordering (page lock ranks above transaction
169 * start), the data can change while IO is in flight. Tell the
170 * block layer it should bounce the bio pages if stable data
171 * during write is required.
172 *
173 * We use up our safety reference in submit_bh().
174 */
175 _submit_bh(write_op, wbuf[i], 1 << BIO_SNAP_STABLE);
176 }
177}
178
179/*
180 * Submit all the data buffers to disk
181 */
182static int journal_submit_data_buffers(journal_t *journal,
183 transaction_t *commit_transaction,
184 int write_op)
185{
186 struct journal_head *jh;
187 struct buffer_head *bh;
188 int locked;
189 int bufs = 0;
190 struct buffer_head **wbuf = journal->j_wbuf;
191 int err = 0;
192
193 /*
194 * Whenever we unlock the journal and sleep, things can get added
195 * onto ->t_sync_datalist, so we have to keep looping back to
196 * write_out_data until we *know* that the list is empty.
197 *
198 * Cleanup any flushed data buffers from the data list. Even in
199 * abort mode, we want to flush this out as soon as possible.
200 */
201write_out_data:
202 cond_resched();
203 spin_lock(&journal->j_list_lock);
204
205 while (commit_transaction->t_sync_datalist) {
206 jh = commit_transaction->t_sync_datalist;
207 bh = jh2bh(jh);
208 locked = 0;
209
210 /* Get reference just to make sure buffer does not disappear
211 * when we are forced to drop various locks */
212 get_bh(bh);
213 /* If the buffer is dirty, we need to submit IO and hence
214 * we need the buffer lock. We try to lock the buffer without
215 * blocking. If we fail, we need to drop j_list_lock and do
216 * blocking lock_buffer().
217 */
218 if (buffer_dirty(bh)) {
219 if (!trylock_buffer(bh)) {
220 BUFFER_TRACE(bh, "needs blocking lock");
221 spin_unlock(&journal->j_list_lock);
222 trace_jbd_do_submit_data(journal,
223 commit_transaction);
224 /* Write out all data to prevent deadlocks */
225 journal_do_submit_data(wbuf, bufs, write_op);
226 bufs = 0;
227 lock_buffer(bh);
228 spin_lock(&journal->j_list_lock);
229 }
230 locked = 1;
231 }
232 /* We have to get bh_state lock. Again out of order, sigh. */
233 if (!inverted_lock(journal, bh)) {
234 jbd_lock_bh_state(bh);
235 spin_lock(&journal->j_list_lock);
236 }
237 /* Someone already cleaned up the buffer? */
238 if (!buffer_jbd(bh) || bh2jh(bh) != jh
239 || jh->b_transaction != commit_transaction
240 || jh->b_jlist != BJ_SyncData) {
241 jbd_unlock_bh_state(bh);
242 if (locked)
243 unlock_buffer(bh);
244 BUFFER_TRACE(bh, "already cleaned up");
245 release_data_buffer(bh);
246 continue;
247 }
248 if (locked && test_clear_buffer_dirty(bh)) {
249 BUFFER_TRACE(bh, "needs writeout, adding to array");
250 wbuf[bufs++] = bh;
251 __journal_file_buffer(jh, commit_transaction,
252 BJ_Locked);
253 jbd_unlock_bh_state(bh);
254 if (bufs == journal->j_wbufsize) {
255 spin_unlock(&journal->j_list_lock);
256 trace_jbd_do_submit_data(journal,
257 commit_transaction);
258 journal_do_submit_data(wbuf, bufs, write_op);
259 bufs = 0;
260 goto write_out_data;
261 }
262 } else if (!locked && buffer_locked(bh)) {
263 __journal_file_buffer(jh, commit_transaction,
264 BJ_Locked);
265 jbd_unlock_bh_state(bh);
266 put_bh(bh);
267 } else {
268 BUFFER_TRACE(bh, "writeout complete: unfile");
269 if (unlikely(!buffer_uptodate(bh)))
270 err = -EIO;
271 __journal_unfile_buffer(jh);
272 jbd_unlock_bh_state(bh);
273 if (locked)
274 unlock_buffer(bh);
275 release_data_buffer(bh);
276 }
277
278 if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
279 spin_unlock(&journal->j_list_lock);
280 goto write_out_data;
281 }
282 }
283 spin_unlock(&journal->j_list_lock);
284 trace_jbd_do_submit_data(journal, commit_transaction);
285 journal_do_submit_data(wbuf, bufs, write_op);
286
287 return err;
288}
289
290/*
291 * journal_commit_transaction
292 *
293 * The primary function for committing a transaction to the log. This
294 * function is called by the journal thread to begin a complete commit.
295 */
296void journal_commit_transaction(journal_t *journal)
297{
298 transaction_t *commit_transaction;
299 struct journal_head *jh, *new_jh, *descriptor;
300 struct buffer_head **wbuf = journal->j_wbuf;
301 int bufs;
302 int flags;
303 int err;
304 unsigned int blocknr;
305 ktime_t start_time;
306 u64 commit_time;
307 char *tagp = NULL;
308 journal_header_t *header;
309 journal_block_tag_t *tag = NULL;
310 int space_left = 0;
311 int first_tag = 0;
312 int tag_flag;
313 int i;
314 struct blk_plug plug;
315 int write_op = WRITE;
316
317 /*
318 * First job: lock down the current transaction and wait for
319 * all outstanding updates to complete.
320 */
321
322 /* Do we need to erase the effects of a prior journal_flush? */
323 if (journal->j_flags & JFS_FLUSHED) {
324 jbd_debug(3, "super block updated\n");
325 mutex_lock(&journal->j_checkpoint_mutex);
326 /*
327 * We hold j_checkpoint_mutex so tail cannot change under us.
328 * We don't need any special data guarantees for writing sb
329 * since journal is empty and it is ok for write to be
330 * flushed only with transaction commit.
331 */
332 journal_update_sb_log_tail(journal, journal->j_tail_sequence,
333 journal->j_tail, WRITE_SYNC);
334 mutex_unlock(&journal->j_checkpoint_mutex);
335 } else {
336 jbd_debug(3, "superblock not updated\n");
337 }
338
339 J_ASSERT(journal->j_running_transaction != NULL);
340 J_ASSERT(journal->j_committing_transaction == NULL);
341
342 commit_transaction = journal->j_running_transaction;
343
344 trace_jbd_start_commit(journal, commit_transaction);
345 jbd_debug(1, "JBD: starting commit of transaction %d\n",
346 commit_transaction->t_tid);
347
348 spin_lock(&journal->j_state_lock);
349 J_ASSERT(commit_transaction->t_state == T_RUNNING);
350 commit_transaction->t_state = T_LOCKED;
351
352 trace_jbd_commit_locking(journal, commit_transaction);
353 spin_lock(&commit_transaction->t_handle_lock);
354 while (commit_transaction->t_updates) {
355 DEFINE_WAIT(wait);
356
357 prepare_to_wait(&journal->j_wait_updates, &wait,
358 TASK_UNINTERRUPTIBLE);
359 if (commit_transaction->t_updates) {
360 spin_unlock(&commit_transaction->t_handle_lock);
361 spin_unlock(&journal->j_state_lock);
362 schedule();
363 spin_lock(&journal->j_state_lock);
364 spin_lock(&commit_transaction->t_handle_lock);
365 }
366 finish_wait(&journal->j_wait_updates, &wait);
367 }
368 spin_unlock(&commit_transaction->t_handle_lock);
369
370 J_ASSERT (commit_transaction->t_outstanding_credits <=
371 journal->j_max_transaction_buffers);
372
373 /*
374 * First thing we are allowed to do is to discard any remaining
375 * BJ_Reserved buffers. Note, it is _not_ permissible to assume
376 * that there are no such buffers: if a large filesystem
377 * operation like a truncate needs to split itself over multiple
378 * transactions, then it may try to do a journal_restart() while
379 * there are still BJ_Reserved buffers outstanding. These must
380 * be released cleanly from the current transaction.
381 *
382 * In this case, the filesystem must still reserve write access
383 * again before modifying the buffer in the new transaction, but
384 * we do not require it to remember exactly which old buffers it
385 * has reserved. This is consistent with the existing behaviour
386 * that multiple journal_get_write_access() calls to the same
387 * buffer are perfectly permissible.
388 */
389 while (commit_transaction->t_reserved_list) {
390 jh = commit_transaction->t_reserved_list;
391 JBUFFER_TRACE(jh, "reserved, unused: refile");
392 /*
393 * A journal_get_undo_access()+journal_release_buffer() may
394 * leave undo-committed data.
395 */
396 if (jh->b_committed_data) {
397 struct buffer_head *bh = jh2bh(jh);
398
399 jbd_lock_bh_state(bh);
400 jbd_free(jh->b_committed_data, bh->b_size);
401 jh->b_committed_data = NULL;
402 jbd_unlock_bh_state(bh);
403 }
404 journal_refile_buffer(journal, jh);
405 }
406
407 /*
408 * Now try to drop any written-back buffers from the journal's
409 * checkpoint lists. We do this *before* commit because it potentially
410 * frees some memory
411 */
412 spin_lock(&journal->j_list_lock);
413 __journal_clean_checkpoint_list(journal);
414 spin_unlock(&journal->j_list_lock);
415
416 jbd_debug (3, "JBD: commit phase 1\n");
417
418 /*
419 * Clear revoked flag to reflect there is no revoked buffers
420 * in the next transaction which is going to be started.
421 */
422 journal_clear_buffer_revoked_flags(journal);
423
424 /*
425 * Switch to a new revoke table.
426 */
427 journal_switch_revoke_table(journal);
428
429 trace_jbd_commit_flushing(journal, commit_transaction);
430 commit_transaction->t_state = T_FLUSH;
431 journal->j_committing_transaction = commit_transaction;
432 journal->j_running_transaction = NULL;
433 start_time = ktime_get();
434 commit_transaction->t_log_start = journal->j_head;
435 wake_up(&journal->j_wait_transaction_locked);
436 spin_unlock(&journal->j_state_lock);
437
438 jbd_debug (3, "JBD: commit phase 2\n");
439
440 if (tid_geq(journal->j_commit_waited, commit_transaction->t_tid))
441 write_op = WRITE_SYNC;
442
443 /*
444 * Now start flushing things to disk, in the order they appear
445 * on the transaction lists. Data blocks go first.
446 */
447 blk_start_plug(&plug);
448 err = journal_submit_data_buffers(journal, commit_transaction,
449 write_op);
450 blk_finish_plug(&plug);
451
452 /*
453 * Wait for all previously submitted IO to complete.
454 */
455 spin_lock(&journal->j_list_lock);
456 while (commit_transaction->t_locked_list) {
457 struct buffer_head *bh;
458
459 jh = commit_transaction->t_locked_list->b_tprev;
460 bh = jh2bh(jh);
461 get_bh(bh);
462 if (buffer_locked(bh)) {
463 spin_unlock(&journal->j_list_lock);
464 wait_on_buffer(bh);
465 spin_lock(&journal->j_list_lock);
466 }
467 if (unlikely(!buffer_uptodate(bh))) {
468 if (!trylock_page(bh->b_page)) {
469 spin_unlock(&journal->j_list_lock);
470 lock_page(bh->b_page);
471 spin_lock(&journal->j_list_lock);
472 }
473 if (bh->b_page->mapping)
474 set_bit(AS_EIO, &bh->b_page->mapping->flags);
475
476 unlock_page(bh->b_page);
477 SetPageError(bh->b_page);
478 err = -EIO;
479 }
480 if (!inverted_lock(journal, bh)) {
481 put_bh(bh);
482 spin_lock(&journal->j_list_lock);
483 continue;
484 }
485 if (buffer_jbd(bh) && bh2jh(bh) == jh &&
486 jh->b_transaction == commit_transaction &&
487 jh->b_jlist == BJ_Locked)
488 __journal_unfile_buffer(jh);
489 jbd_unlock_bh_state(bh);
490 release_data_buffer(bh);
491 cond_resched_lock(&journal->j_list_lock);
492 }
493 spin_unlock(&journal->j_list_lock);
494
495 if (err) {
496 char b[BDEVNAME_SIZE];
497
498 printk(KERN_WARNING
499 "JBD: Detected IO errors while flushing file data "
500 "on %s\n", bdevname(journal->j_fs_dev, b));
501 if (journal->j_flags & JFS_ABORT_ON_SYNCDATA_ERR)
502 journal_abort(journal, err);
503 err = 0;
504 }
505
506 blk_start_plug(&plug);
507
508 journal_write_revoke_records(journal, commit_transaction, write_op);
509
510 /*
511 * If we found any dirty or locked buffers, then we should have
512 * looped back up to the write_out_data label. If there weren't
513 * any then journal_clean_data_list should have wiped the list
514 * clean by now, so check that it is in fact empty.
515 */
516 J_ASSERT (commit_transaction->t_sync_datalist == NULL);
517
518 jbd_debug (3, "JBD: commit phase 3\n");
519
520 /*
521 * Way to go: we have now written out all of the data for a
522 * transaction! Now comes the tricky part: we need to write out
523 * metadata. Loop over the transaction's entire buffer list:
524 */
525 spin_lock(&journal->j_state_lock);
526 commit_transaction->t_state = T_COMMIT;
527 spin_unlock(&journal->j_state_lock);
528
529 trace_jbd_commit_logging(journal, commit_transaction);
530 J_ASSERT(commit_transaction->t_nr_buffers <=
531 commit_transaction->t_outstanding_credits);
532
533 descriptor = NULL;
534 bufs = 0;
535 while (commit_transaction->t_buffers) {
536
537 /* Find the next buffer to be journaled... */
538
539 jh = commit_transaction->t_buffers;
540
541 /* If we're in abort mode, we just un-journal the buffer and
542 release it. */
543
544 if (is_journal_aborted(journal)) {
545 clear_buffer_jbddirty(jh2bh(jh));
546 JBUFFER_TRACE(jh, "journal is aborting: refile");
547 journal_refile_buffer(journal, jh);
548 /* If that was the last one, we need to clean up
549 * any descriptor buffers which may have been
550 * already allocated, even if we are now
551 * aborting. */
552 if (!commit_transaction->t_buffers)
553 goto start_journal_io;
554 continue;
555 }
556
557 /* Make sure we have a descriptor block in which to
558 record the metadata buffer. */
559
560 if (!descriptor) {
561 struct buffer_head *bh;
562
563 J_ASSERT (bufs == 0);
564
565 jbd_debug(4, "JBD: get descriptor\n");
566
567 descriptor = journal_get_descriptor_buffer(journal);
568 if (!descriptor) {
569 journal_abort(journal, -EIO);
570 continue;
571 }
572
573 bh = jh2bh(descriptor);
574 jbd_debug(4, "JBD: got buffer %llu (%p)\n",
575 (unsigned long long)bh->b_blocknr, bh->b_data);
576 header = (journal_header_t *)&bh->b_data[0];
577 header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
578 header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK);
579 header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
580
581 tagp = &bh->b_data[sizeof(journal_header_t)];
582 space_left = bh->b_size - sizeof(journal_header_t);
583 first_tag = 1;
584 set_buffer_jwrite(bh);
585 set_buffer_dirty(bh);
586 wbuf[bufs++] = bh;
587
588 /* Record it so that we can wait for IO
589 completion later */
590 BUFFER_TRACE(bh, "ph3: file as descriptor");
591 journal_file_buffer(descriptor, commit_transaction,
592 BJ_LogCtl);
593 }
594
595 /* Where is the buffer to be written? */
596
597 err = journal_next_log_block(journal, &blocknr);
598 /* If the block mapping failed, just abandon the buffer
599 and repeat this loop: we'll fall into the
600 refile-on-abort condition above. */
601 if (err) {
602 journal_abort(journal, err);
603 continue;
604 }
605
606 /*
607 * start_this_handle() uses t_outstanding_credits to determine
608 * the free space in the log, but this counter is changed
609 * by journal_next_log_block() also.
610 */
611 commit_transaction->t_outstanding_credits--;
612
613 /* Bump b_count to prevent truncate from stumbling over
614 the shadowed buffer! @@@ This can go if we ever get
615 rid of the BJ_IO/BJ_Shadow pairing of buffers. */
616 get_bh(jh2bh(jh));
617
618 /* Make a temporary IO buffer with which to write it out
619 (this will requeue both the metadata buffer and the
620 temporary IO buffer). new_bh goes on BJ_IO */
621
622 set_buffer_jwrite(jh2bh(jh));
623 /*
624 * akpm: journal_write_metadata_buffer() sets
625 * new_bh->b_transaction to commit_transaction.
626 * We need to clean this up before we release new_bh
627 * (which is of type BJ_IO)
628 */
629 JBUFFER_TRACE(jh, "ph3: write metadata");
630 flags = journal_write_metadata_buffer(commit_transaction,
631 jh, &new_jh, blocknr);
632 set_buffer_jwrite(jh2bh(new_jh));
633 wbuf[bufs++] = jh2bh(new_jh);
634
635 /* Record the new block's tag in the current descriptor
636 buffer */
637
638 tag_flag = 0;
639 if (flags & 1)
640 tag_flag |= JFS_FLAG_ESCAPE;
641 if (!first_tag)
642 tag_flag |= JFS_FLAG_SAME_UUID;
643
644 tag = (journal_block_tag_t *) tagp;
645 tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr);
646 tag->t_flags = cpu_to_be32(tag_flag);
647 tagp += sizeof(journal_block_tag_t);
648 space_left -= sizeof(journal_block_tag_t);
649
650 if (first_tag) {
651 memcpy (tagp, journal->j_uuid, 16);
652 tagp += 16;
653 space_left -= 16;
654 first_tag = 0;
655 }
656
657 /* If there's no more to do, or if the descriptor is full,
658 let the IO rip! */
659
660 if (bufs == journal->j_wbufsize ||
661 commit_transaction->t_buffers == NULL ||
662 space_left < sizeof(journal_block_tag_t) + 16) {
663
664 jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
665
666 /* Write an end-of-descriptor marker before
667 submitting the IOs. "tag" still points to
668 the last tag we set up. */
669
670 tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG);
671
672start_journal_io:
673 for (i = 0; i < bufs; i++) {
674 struct buffer_head *bh = wbuf[i];
675 lock_buffer(bh);
676 clear_buffer_dirty(bh);
677 set_buffer_uptodate(bh);
678 bh->b_end_io = journal_end_buffer_io_sync;
679 /*
680 * In data=journal mode, here we can end up
681 * writing pagecache data that might be
682 * mmapped. Since we can't afford to clean the
683 * page and set PageWriteback (see the comment
684 * near the other use of _submit_bh()), the
685 * data can change while the write is in
686 * flight. Tell the block layer to bounce the
687 * bio pages if stable pages are required.
688 */
689 _submit_bh(write_op, bh, 1 << BIO_SNAP_STABLE);
690 }
691 cond_resched();
692
693 /* Force a new descriptor to be generated next
694 time round the loop. */
695 descriptor = NULL;
696 bufs = 0;
697 }
698 }
699
700 blk_finish_plug(&plug);
701
702 /* Lo and behold: we have just managed to send a transaction to
703 the log. Before we can commit it, wait for the IO so far to
704 complete. Control buffers being written are on the
705 transaction's t_log_list queue, and metadata buffers are on
706 the t_iobuf_list queue.
707
708 Wait for the buffers in reverse order. That way we are
709 less likely to be woken up until all IOs have completed, and
710 so we incur less scheduling load.
711 */
712
713 jbd_debug(3, "JBD: commit phase 4\n");
714
715 /*
716 * akpm: these are BJ_IO, and j_list_lock is not needed.
717 * See __journal_try_to_free_buffer.
718 */
719wait_for_iobuf:
720 while (commit_transaction->t_iobuf_list != NULL) {
721 struct buffer_head *bh;
722
723 jh = commit_transaction->t_iobuf_list->b_tprev;
724 bh = jh2bh(jh);
725 if (buffer_locked(bh)) {
726 wait_on_buffer(bh);
727 goto wait_for_iobuf;
728 }
729 if (cond_resched())
730 goto wait_for_iobuf;
731
732 if (unlikely(!buffer_uptodate(bh)))
733 err = -EIO;
734
735 clear_buffer_jwrite(bh);
736
737 JBUFFER_TRACE(jh, "ph4: unfile after journal write");
738 journal_unfile_buffer(journal, jh);
739
740 /*
741 * ->t_iobuf_list should contain only dummy buffer_heads
742 * which were created by journal_write_metadata_buffer().
743 */
744 BUFFER_TRACE(bh, "dumping temporary bh");
745 journal_put_journal_head(jh);
746 __brelse(bh);
747 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
748 free_buffer_head(bh);
749
750 /* We also have to unlock and free the corresponding
751 shadowed buffer */
752 jh = commit_transaction->t_shadow_list->b_tprev;
753 bh = jh2bh(jh);
754 clear_buffer_jwrite(bh);
755 J_ASSERT_BH(bh, buffer_jbddirty(bh));
756
757 /* The metadata is now released for reuse, but we need
758 to remember it against this transaction so that when
759 we finally commit, we can do any checkpointing
760 required. */
761 JBUFFER_TRACE(jh, "file as BJ_Forget");
762 journal_file_buffer(jh, commit_transaction, BJ_Forget);
763 /*
764 * Wake up any transactions which were waiting for this
765 * IO to complete. The barrier must be here so that changes
766 * by journal_file_buffer() take effect before wake_up_bit()
767 * does the waitqueue check.
768 */
769 smp_mb();
770 wake_up_bit(&bh->b_state, BH_Unshadow);
771 JBUFFER_TRACE(jh, "brelse shadowed buffer");
772 __brelse(bh);
773 }
774
775 J_ASSERT (commit_transaction->t_shadow_list == NULL);
776
777 jbd_debug(3, "JBD: commit phase 5\n");
778
779 /* Here we wait for the revoke record and descriptor record buffers */
780 wait_for_ctlbuf:
781 while (commit_transaction->t_log_list != NULL) {
782 struct buffer_head *bh;
783
784 jh = commit_transaction->t_log_list->b_tprev;
785 bh = jh2bh(jh);
786 if (buffer_locked(bh)) {
787 wait_on_buffer(bh);
788 goto wait_for_ctlbuf;
789 }
790 if (cond_resched())
791 goto wait_for_ctlbuf;
792
793 if (unlikely(!buffer_uptodate(bh)))
794 err = -EIO;
795
796 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
797 clear_buffer_jwrite(bh);
798 journal_unfile_buffer(journal, jh);
799 journal_put_journal_head(jh);
800 __brelse(bh); /* One for getblk */
801 /* AKPM: bforget here */
802 }
803
804 if (err)
805 journal_abort(journal, err);
806
807 jbd_debug(3, "JBD: commit phase 6\n");
808
809 /* All metadata is written, now write commit record and do cleanup */
810 spin_lock(&journal->j_state_lock);
811 J_ASSERT(commit_transaction->t_state == T_COMMIT);
812 commit_transaction->t_state = T_COMMIT_RECORD;
813 spin_unlock(&journal->j_state_lock);
814
815 if (journal_write_commit_record(journal, commit_transaction))
816 err = -EIO;
817
818 if (err)
819 journal_abort(journal, err);
820
821 /* End of a transaction! Finally, we can do checkpoint
822 processing: any buffers committed as a result of this
823 transaction can be removed from any checkpoint list it was on
824 before. */
825
826 jbd_debug(3, "JBD: commit phase 7\n");
827
828 J_ASSERT(commit_transaction->t_sync_datalist == NULL);
829 J_ASSERT(commit_transaction->t_buffers == NULL);
830 J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
831 J_ASSERT(commit_transaction->t_iobuf_list == NULL);
832 J_ASSERT(commit_transaction->t_shadow_list == NULL);
833 J_ASSERT(commit_transaction->t_log_list == NULL);
834
835restart_loop:
836 /*
837 * As there are other places (journal_unmap_buffer()) adding buffers
838 * to this list we have to be careful and hold the j_list_lock.
839 */
840 spin_lock(&journal->j_list_lock);
841 while (commit_transaction->t_forget) {
842 transaction_t *cp_transaction;
843 struct buffer_head *bh;
844 int try_to_free = 0;
845
846 jh = commit_transaction->t_forget;
847 spin_unlock(&journal->j_list_lock);
848 bh = jh2bh(jh);
849 /*
850 * Get a reference so that bh cannot be freed before we are
851 * done with it.
852 */
853 get_bh(bh);
854 jbd_lock_bh_state(bh);
855 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
856 jh->b_transaction == journal->j_running_transaction);
857
858 /*
859 * If there is undo-protected committed data against
860 * this buffer, then we can remove it now. If it is a
861 * buffer needing such protection, the old frozen_data
862 * field now points to a committed version of the
863 * buffer, so rotate that field to the new committed
864 * data.
865 *
866 * Otherwise, we can just throw away the frozen data now.
867 */
868 if (jh->b_committed_data) {
869 jbd_free(jh->b_committed_data, bh->b_size);
870 jh->b_committed_data = NULL;
871 if (jh->b_frozen_data) {
872 jh->b_committed_data = jh->b_frozen_data;
873 jh->b_frozen_data = NULL;
874 }
875 } else if (jh->b_frozen_data) {
876 jbd_free(jh->b_frozen_data, bh->b_size);
877 jh->b_frozen_data = NULL;
878 }
879
880 spin_lock(&journal->j_list_lock);
881 cp_transaction = jh->b_cp_transaction;
882 if (cp_transaction) {
883 JBUFFER_TRACE(jh, "remove from old cp transaction");
884 __journal_remove_checkpoint(jh);
885 }
886
887 /* Only re-checkpoint the buffer_head if it is marked
888 * dirty. If the buffer was added to the BJ_Forget list
889 * by journal_forget, it may no longer be dirty and
890 * there's no point in keeping a checkpoint record for
891 * it. */
892
893 /*
894 * A buffer which has been freed while still being journaled by
895 * a previous transaction.
896 */
897 if (buffer_freed(bh)) {
898 /*
899 * If the running transaction is the one containing
900 * "add to orphan" operation (b_next_transaction !=
901 * NULL), we have to wait for that transaction to
902 * commit before we can really get rid of the buffer.
903 * So just clear b_modified to not confuse transaction
904 * credit accounting and refile the buffer to
905 * BJ_Forget of the running transaction. If the just
906 * committed transaction contains "add to orphan"
907 * operation, we can completely invalidate the buffer
908 * now. We are rather thorough in that since the
909 * buffer may still be accessible when blocksize <
910 * pagesize and it is attached to the last partial
911 * page.
912 */
913 jh->b_modified = 0;
914 if (!jh->b_next_transaction) {
915 clear_buffer_freed(bh);
916 clear_buffer_jbddirty(bh);
917 clear_buffer_mapped(bh);
918 clear_buffer_new(bh);
919 clear_buffer_req(bh);
920 bh->b_bdev = NULL;
921 }
922 }
923
924 if (buffer_jbddirty(bh)) {
925 JBUFFER_TRACE(jh, "add to new checkpointing trans");
926 __journal_insert_checkpoint(jh, commit_transaction);
927 if (is_journal_aborted(journal))
928 clear_buffer_jbddirty(bh);
929 } else {
930 J_ASSERT_BH(bh, !buffer_dirty(bh));
931 /*
932 * The buffer on BJ_Forget list and not jbddirty means
933 * it has been freed by this transaction and hence it
934 * could not have been reallocated until this
935 * transaction has committed. *BUT* it could be
936 * reallocated once we have written all the data to
937 * disk and before we process the buffer on BJ_Forget
938 * list.
939 */
940 if (!jh->b_next_transaction)
941 try_to_free = 1;
942 }
943 JBUFFER_TRACE(jh, "refile or unfile freed buffer");
944 __journal_refile_buffer(jh);
945 jbd_unlock_bh_state(bh);
946 if (try_to_free)
947 release_buffer_page(bh);
948 else
949 __brelse(bh);
950 cond_resched_lock(&journal->j_list_lock);
951 }
952 spin_unlock(&journal->j_list_lock);
953 /*
954 * This is a bit sleazy. We use j_list_lock to protect transition
955 * of a transaction into T_FINISHED state and calling
956 * __journal_drop_transaction(). Otherwise we could race with
957 * other checkpointing code processing the transaction...
958 */
959 spin_lock(&journal->j_state_lock);
960 spin_lock(&journal->j_list_lock);
961 /*
962 * Now recheck if some buffers did not get attached to the transaction
963 * while the lock was dropped...
964 */
965 if (commit_transaction->t_forget) {
966 spin_unlock(&journal->j_list_lock);
967 spin_unlock(&journal->j_state_lock);
968 goto restart_loop;
969 }
970
971 /* Done with this transaction! */
972
973 jbd_debug(3, "JBD: commit phase 8\n");
974
975 J_ASSERT(commit_transaction->t_state == T_COMMIT_RECORD);
976
977 commit_transaction->t_state = T_FINISHED;
978 J_ASSERT(commit_transaction == journal->j_committing_transaction);
979 journal->j_commit_sequence = commit_transaction->t_tid;
980 journal->j_committing_transaction = NULL;
981 commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
982
983 /*
984 * weight the commit time higher than the average time so we don't
985 * react too strongly to vast changes in commit time
986 */
987 if (likely(journal->j_average_commit_time))
988 journal->j_average_commit_time = (commit_time*3 +
989 journal->j_average_commit_time) / 4;
990 else
991 journal->j_average_commit_time = commit_time;
992
993 spin_unlock(&journal->j_state_lock);
994
995 if (commit_transaction->t_checkpoint_list == NULL &&
996 commit_transaction->t_checkpoint_io_list == NULL) {
997 __journal_drop_transaction(journal, commit_transaction);
998 } else {
999 if (journal->j_checkpoint_transactions == NULL) {
1000 journal->j_checkpoint_transactions = commit_transaction;
1001 commit_transaction->t_cpnext = commit_transaction;
1002 commit_transaction->t_cpprev = commit_transaction;
1003 } else {
1004 commit_transaction->t_cpnext =
1005 journal->j_checkpoint_transactions;
1006 commit_transaction->t_cpprev =
1007 commit_transaction->t_cpnext->t_cpprev;
1008 commit_transaction->t_cpnext->t_cpprev =
1009 commit_transaction;
1010 commit_transaction->t_cpprev->t_cpnext =
1011 commit_transaction;
1012 }
1013 }
1014 spin_unlock(&journal->j_list_lock);
1015
1016 trace_jbd_end_commit(journal, commit_transaction);
1017 jbd_debug(1, "JBD: commit %d complete, head %d\n",
1018 journal->j_commit_sequence, journal->j_tail_sequence);
1019
1020 wake_up(&journal->j_wait_done_commit);
1021}
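
The 3:1 weighting in commit phase 8 above is an exponential moving average that favours the most recent commit time. A minimal, stand-alone sketch of the same arithmetic (the sample values are invented purely for illustration):

/* Illustrative only: the weighting applied to j_average_commit_time above. */
#include <stdio.h>

int main(void)
{
	unsigned long long avg = 0;                                        /* j_average_commit_time */
	unsigned long long commits_ns[] = { 8000000, 12000000, 9000000 }; /* made-up samples, in ns */
	int i;

	for (i = 0; i < 3; i++) {
		if (avg)
			avg = (commits_ns[i] * 3 + avg) / 4;  /* latest sample weighted 3x */
		else
			avg = commits_ns[i];                  /* first commit seeds the average */
		printf("after commit %d: average = %llu ns\n", i, avg);
	}
	return 0;
}
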
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
deleted file mode 100644
index c46a79adb6ad..000000000000
--- a/fs/jbd/journal.c
+++ /dev/null
@@ -1,2145 +0,0 @@
1/*
2 * linux/fs/jbd/journal.c
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5 *
6 * Copyright 1998 Red Hat corp --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Generic filesystem journal-writing code; part of the ext2fs
13 * journaling system.
14 *
15 * This file manages journals: areas of disk reserved for logging
16 * transactional updates. This includes the kernel journaling thread
17 * which is responsible for scheduling updates to the log.
18 *
19 * We do not actually manage the physical storage of the journal in this
20 * file: that is left to a per-journal policy function, which allows us
21 * to store the journal within a filesystem-specified area for ext2
22 * journaling (ext2 can use a reserved inode for storing the log).
23 */
24
25#include <linux/module.h>
26#include <linux/time.h>
27#include <linux/fs.h>
28#include <linux/jbd.h>
29#include <linux/errno.h>
30#include <linux/slab.h>
31#include <linux/init.h>
32#include <linux/mm.h>
33#include <linux/freezer.h>
34#include <linux/pagemap.h>
35#include <linux/kthread.h>
36#include <linux/poison.h>
37#include <linux/proc_fs.h>
38#include <linux/debugfs.h>
39#include <linux/ratelimit.h>
40
41#define CREATE_TRACE_POINTS
42#include <trace/events/jbd.h>
43
44#include <asm/uaccess.h>
45#include <asm/page.h>
46
47EXPORT_SYMBOL(journal_start);
48EXPORT_SYMBOL(journal_restart);
49EXPORT_SYMBOL(journal_extend);
50EXPORT_SYMBOL(journal_stop);
51EXPORT_SYMBOL(journal_lock_updates);
52EXPORT_SYMBOL(journal_unlock_updates);
53EXPORT_SYMBOL(journal_get_write_access);
54EXPORT_SYMBOL(journal_get_create_access);
55EXPORT_SYMBOL(journal_get_undo_access);
56EXPORT_SYMBOL(journal_dirty_data);
57EXPORT_SYMBOL(journal_dirty_metadata);
58EXPORT_SYMBOL(journal_release_buffer);
59EXPORT_SYMBOL(journal_forget);
60#if 0
61EXPORT_SYMBOL(journal_sync_buffer);
62#endif
63EXPORT_SYMBOL(journal_flush);
64EXPORT_SYMBOL(journal_revoke);
65
66EXPORT_SYMBOL(journal_init_dev);
67EXPORT_SYMBOL(journal_init_inode);
68EXPORT_SYMBOL(journal_update_format);
69EXPORT_SYMBOL(journal_check_used_features);
70EXPORT_SYMBOL(journal_check_available_features);
71EXPORT_SYMBOL(journal_set_features);
72EXPORT_SYMBOL(journal_create);
73EXPORT_SYMBOL(journal_load);
74EXPORT_SYMBOL(journal_destroy);
75EXPORT_SYMBOL(journal_abort);
76EXPORT_SYMBOL(journal_errno);
77EXPORT_SYMBOL(journal_ack_err);
78EXPORT_SYMBOL(journal_clear_err);
79EXPORT_SYMBOL(log_wait_commit);
80EXPORT_SYMBOL(log_start_commit);
81EXPORT_SYMBOL(journal_start_commit);
82EXPORT_SYMBOL(journal_force_commit_nested);
83EXPORT_SYMBOL(journal_wipe);
84EXPORT_SYMBOL(journal_blocks_per_page);
85EXPORT_SYMBOL(journal_invalidatepage);
86EXPORT_SYMBOL(journal_try_to_free_buffers);
87EXPORT_SYMBOL(journal_force_commit);
88
89static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
90static void __journal_abort_soft (journal_t *journal, int errno);
91static const char *journal_dev_name(journal_t *journal, char *buffer);
92
93#ifdef CONFIG_JBD_DEBUG
94void __jbd_debug(int level, const char *file, const char *func,
95 unsigned int line, const char *fmt, ...)
96{
97 struct va_format vaf;
98 va_list args;
99
100 if (level > journal_enable_debug)
101 return;
102 va_start(args, fmt);
103 vaf.fmt = fmt;
104 vaf.va = &args;
105 printk(KERN_DEBUG "%s: (%s, %u): %pV\n", file, func, line, &vaf);
106 va_end(args);
107}
108EXPORT_SYMBOL(__jbd_debug);
109#endif
110
111/*
112 * Helper function used to manage commit timeouts
113 */
114
115static void commit_timeout(unsigned long __data)
116{
117 struct task_struct * p = (struct task_struct *) __data;
118
119 wake_up_process(p);
120}
121
122/*
123 * kjournald: The main thread function used to manage a logging device
124 * journal.
125 *
126 * This kernel thread is responsible for two things:
127 *
128 * 1) COMMIT: Every so often we need to commit the current state of the
129 * filesystem to disk. The journal thread is responsible for writing
130 * all of the metadata buffers to disk.
131 *
132 * 2) CHECKPOINT: We cannot reuse a used section of the log file until all
133 * of the data in that part of the log has been rewritten elsewhere on
134 * the disk. Flushing these old buffers to reclaim space in the log is
135 * known as checkpointing, and this thread is responsible for that job.
136 */
137
138static int kjournald(void *arg)
139{
140 journal_t *journal = arg;
141 transaction_t *transaction;
142
143 /*
144 * Set up an interval timer which can be used to trigger a commit wakeup
145 * after the commit interval expires
146 */
147 setup_timer(&journal->j_commit_timer, commit_timeout,
148 (unsigned long)current);
149
150 set_freezable();
151
152 /* Record that the journal thread is running */
153 journal->j_task = current;
154 wake_up(&journal->j_wait_done_commit);
155
156 printk(KERN_INFO "kjournald starting. Commit interval %ld seconds\n",
157 journal->j_commit_interval / HZ);
158
159 /*
160 * And now, wait forever for commit wakeup events.
161 */
162 spin_lock(&journal->j_state_lock);
163
164loop:
165 if (journal->j_flags & JFS_UNMOUNT)
166 goto end_loop;
167
168 jbd_debug(1, "commit_sequence=%d, commit_request=%d\n",
169 journal->j_commit_sequence, journal->j_commit_request);
170
171 if (journal->j_commit_sequence != journal->j_commit_request) {
172 jbd_debug(1, "OK, requests differ\n");
173 spin_unlock(&journal->j_state_lock);
174 del_timer_sync(&journal->j_commit_timer);
175 journal_commit_transaction(journal);
176 spin_lock(&journal->j_state_lock);
177 goto loop;
178 }
179
180 wake_up(&journal->j_wait_done_commit);
181 if (freezing(current)) {
182 /*
183 * The simpler the better. Flushing journal isn't a
184 * good idea, because that depends on threads that may
185 * be already stopped.
186 */
187 jbd_debug(1, "Now suspending kjournald\n");
188 spin_unlock(&journal->j_state_lock);
189 try_to_freeze();
190 spin_lock(&journal->j_state_lock);
191 } else {
192 /*
193 * We assume on resume that commits are already there,
194 * so we don't sleep
195 */
196 DEFINE_WAIT(wait);
197 int should_sleep = 1;
198
199 prepare_to_wait(&journal->j_wait_commit, &wait,
200 TASK_INTERRUPTIBLE);
201 if (journal->j_commit_sequence != journal->j_commit_request)
202 should_sleep = 0;
203 transaction = journal->j_running_transaction;
204 if (transaction && time_after_eq(jiffies,
205 transaction->t_expires))
206 should_sleep = 0;
207 if (journal->j_flags & JFS_UNMOUNT)
208 should_sleep = 0;
209 if (should_sleep) {
210 spin_unlock(&journal->j_state_lock);
211 schedule();
212 spin_lock(&journal->j_state_lock);
213 }
214 finish_wait(&journal->j_wait_commit, &wait);
215 }
216
217 jbd_debug(1, "kjournald wakes\n");
218
219 /*
220 * Were we woken up by a commit wakeup event?
221 */
222 transaction = journal->j_running_transaction;
223 if (transaction && time_after_eq(jiffies, transaction->t_expires)) {
224 journal->j_commit_request = transaction->t_tid;
225 jbd_debug(1, "woke because of timeout\n");
226 }
227 goto loop;
228
229end_loop:
230 spin_unlock(&journal->j_state_lock);
231 del_timer_sync(&journal->j_commit_timer);
232 journal->j_task = NULL;
233 wake_up(&journal->j_wait_done_commit);
234 jbd_debug(1, "Journal thread exiting.\n");
235 return 0;
236}
237
238static int journal_start_thread(journal_t *journal)
239{
240 struct task_struct *t;
241
242 t = kthread_run(kjournald, journal, "kjournald");
243 if (IS_ERR(t))
244 return PTR_ERR(t);
245
246 wait_event(journal->j_wait_done_commit, journal->j_task != NULL);
247 return 0;
248}
249
250static void journal_kill_thread(journal_t *journal)
251{
252 spin_lock(&journal->j_state_lock);
253 journal->j_flags |= JFS_UNMOUNT;
254
255 while (journal->j_task) {
256 wake_up(&journal->j_wait_commit);
257 spin_unlock(&journal->j_state_lock);
258 wait_event(journal->j_wait_done_commit,
259 journal->j_task == NULL);
260 spin_lock(&journal->j_state_lock);
261 }
262 spin_unlock(&journal->j_state_lock);
263}
264
265/*
266 * journal_write_metadata_buffer: write a metadata buffer to the journal.
267 *
268 * Writes a metadata buffer to a given disk block. The actual IO is not
269 * performed but a new buffer_head is constructed which labels the data
270 * to be written with the correct destination disk block.
271 *
272 * Any magic-number escaping which needs to be done will cause a
273 * copy-out here. If the buffer happens to start with the
274 * JFS_MAGIC_NUMBER, then we can't write it to the log directly: the
 275 * magic number is only written to the log for descriptor blocks. In
276 * this case, we copy the data and replace the first word with 0, and we
277 * return a result code which indicates that this buffer needs to be
278 * marked as an escaped buffer in the corresponding log descriptor
279 * block. The missing word can then be restored when the block is read
280 * during recovery.
281 *
282 * If the source buffer has already been modified by a new transaction
283 * since we took the last commit snapshot, we use the frozen copy of
284 * that data for IO. If we end up using the existing buffer_head's data
285 * for the write, then we *have* to lock the buffer to prevent anyone
286 * else from using and possibly modifying it while the IO is in
287 * progress.
288 *
 289 * The function sets *jh_out to the journal_head to be used for the IO.
290 *
291 * We assume that the journal has already been locked in this function.
292 *
293 * Return value:
294 * <0: Error
295 * >=0: Finished OK
296 *
297 * On success:
298 * Bit 0 set == escape performed on the data
299 * Bit 1 set == buffer copy-out performed (kfree the data after IO)
300 */
301
302int journal_write_metadata_buffer(transaction_t *transaction,
303 struct journal_head *jh_in,
304 struct journal_head **jh_out,
305 unsigned int blocknr)
306{
307 int need_copy_out = 0;
308 int done_copy_out = 0;
309 int do_escape = 0;
310 char *mapped_data;
311 struct buffer_head *new_bh;
312 struct journal_head *new_jh;
313 struct page *new_page;
314 unsigned int new_offset;
315 struct buffer_head *bh_in = jh2bh(jh_in);
316 journal_t *journal = transaction->t_journal;
317
318 /*
319 * The buffer really shouldn't be locked: only the current committing
320 * transaction is allowed to write it, so nobody else is allowed
321 * to do any IO.
322 *
323 * akpm: except if we're journalling data, and write() output is
324 * also part of a shared mapping, and another thread has
325 * decided to launch a writepage() against this buffer.
326 */
327 J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
328
329 new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);
330 /* keep subsequent assertions sane */
331 atomic_set(&new_bh->b_count, 1);
332 new_jh = journal_add_journal_head(new_bh); /* This sleeps */
333
334 /*
335 * If a new transaction has already done a buffer copy-out, then
336 * we use that version of the data for the commit.
337 */
338 jbd_lock_bh_state(bh_in);
339repeat:
340 if (jh_in->b_frozen_data) {
341 done_copy_out = 1;
342 new_page = virt_to_page(jh_in->b_frozen_data);
343 new_offset = offset_in_page(jh_in->b_frozen_data);
344 } else {
345 new_page = jh2bh(jh_in)->b_page;
346 new_offset = offset_in_page(jh2bh(jh_in)->b_data);
347 }
348
349 mapped_data = kmap_atomic(new_page);
350 /*
351 * Check for escaping
352 */
353 if (*((__be32 *)(mapped_data + new_offset)) ==
354 cpu_to_be32(JFS_MAGIC_NUMBER)) {
355 need_copy_out = 1;
356 do_escape = 1;
357 }
358 kunmap_atomic(mapped_data);
359
360 /*
361 * Do we need to do a data copy?
362 */
363 if (need_copy_out && !done_copy_out) {
364 char *tmp;
365
366 jbd_unlock_bh_state(bh_in);
367 tmp = jbd_alloc(bh_in->b_size, GFP_NOFS);
368 jbd_lock_bh_state(bh_in);
369 if (jh_in->b_frozen_data) {
370 jbd_free(tmp, bh_in->b_size);
371 goto repeat;
372 }
373
374 jh_in->b_frozen_data = tmp;
375 mapped_data = kmap_atomic(new_page);
376 memcpy(tmp, mapped_data + new_offset, jh2bh(jh_in)->b_size);
377 kunmap_atomic(mapped_data);
378
379 new_page = virt_to_page(tmp);
380 new_offset = offset_in_page(tmp);
381 done_copy_out = 1;
382 }
383
384 /*
 385 * Did we need to escape the data? Now that we've done all the
 386 * copying, we can finally do so.
387 */
388 if (do_escape) {
389 mapped_data = kmap_atomic(new_page);
390 *((unsigned int *)(mapped_data + new_offset)) = 0;
391 kunmap_atomic(mapped_data);
392 }
393
394 set_bh_page(new_bh, new_page, new_offset);
395 new_jh->b_transaction = NULL;
396 new_bh->b_size = jh2bh(jh_in)->b_size;
397 new_bh->b_bdev = transaction->t_journal->j_dev;
398 new_bh->b_blocknr = blocknr;
399 set_buffer_mapped(new_bh);
400 set_buffer_dirty(new_bh);
401
402 *jh_out = new_jh;
403
404 /*
405 * The to-be-written buffer needs to get moved to the io queue,
406 * and the original buffer whose contents we are shadowing or
407 * copying is moved to the transaction's shadow queue.
408 */
409 JBUFFER_TRACE(jh_in, "file as BJ_Shadow");
410 spin_lock(&journal->j_list_lock);
411 __journal_file_buffer(jh_in, transaction, BJ_Shadow);
412 spin_unlock(&journal->j_list_lock);
413 jbd_unlock_bh_state(bh_in);
414
415 JBUFFER_TRACE(new_jh, "file as BJ_IO");
416 journal_file_buffer(new_jh, transaction, BJ_IO);
417
418 return do_escape | (done_copy_out << 1);
419}
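
As the comment above notes, a non-negative return value packs two flags into bits 0 and 1. A hedged sketch of how a caller could interpret it (the helper name is illustrative and not part of this file; callers check for a negative error first):

/* Illustrative decoding of journal_write_metadata_buffer()'s return value. */
static void decode_write_result(int result, int *escaped, int *copied_out)
{
	*escaped    = result & 1;         /* bit 0: magic number was escaped             */
	*copied_out = (result >> 1) & 1;  /* bit 1: data was copied out (frozen buffer)  */
}
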
420
421/*
422 * Allocation code for the journal file. Manage the space left in the
423 * journal, so that we can begin checkpointing when appropriate.
424 */
425
426/*
427 * __log_space_left: Return the number of free blocks left in the journal.
428 *
429 * Called with the journal already locked.
430 *
431 * Called under j_state_lock
432 */
433
434int __log_space_left(journal_t *journal)
435{
436 int left = journal->j_free;
437
438 assert_spin_locked(&journal->j_state_lock);
439
440 /*
441 * Be pessimistic here about the number of those free blocks which
442 * might be required for log descriptor control blocks.
443 */
444
445#define MIN_LOG_RESERVED_BLOCKS 32 /* Allow for rounding errors */
446
447 left -= MIN_LOG_RESERVED_BLOCKS;
448
449 if (left <= 0)
450 return 0;
451 left -= (left >> 3);
452 return left;
453}
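
As a worked example of the pessimistic estimate above (figures are illustrative): with j_free = 1024 blocks, subtracting the 32-block reserve leaves 992, and dropping a further eighth for descriptor blocks (992 >> 3 = 124) leaves 868 blocks reported as usable.
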
454
455/*
456 * Called under j_state_lock. Returns true if a transaction commit was started.
457 */
458int __log_start_commit(journal_t *journal, tid_t target)
459{
460 /*
461 * The only transaction we can possibly wait upon is the
462 * currently running transaction (if it exists). Otherwise,
463 * the target tid must be an old one.
464 */
465 if (journal->j_commit_request != target &&
466 journal->j_running_transaction &&
467 journal->j_running_transaction->t_tid == target) {
468 /*
469 * We want a new commit: OK, mark the request and wakeup the
470 * commit thread. We do _not_ do the commit ourselves.
471 */
472
473 journal->j_commit_request = target;
474 jbd_debug(1, "JBD: requesting commit %d/%d\n",
475 journal->j_commit_request,
476 journal->j_commit_sequence);
477 wake_up(&journal->j_wait_commit);
478 return 1;
479 } else if (!tid_geq(journal->j_commit_request, target))
480 /* This should never happen, but if it does, preserve
481 the evidence before kjournald goes into a loop and
482 increments j_commit_sequence beyond all recognition. */
483 WARN_ONCE(1, "jbd: bad log_start_commit: %u %u %u %u\n",
484 journal->j_commit_request, journal->j_commit_sequence,
485 target, journal->j_running_transaction ?
486 journal->j_running_transaction->t_tid : 0);
487 return 0;
488}
489
490int log_start_commit(journal_t *journal, tid_t tid)
491{
492 int ret;
493
494 spin_lock(&journal->j_state_lock);
495 ret = __log_start_commit(journal, tid);
496 spin_unlock(&journal->j_state_lock);
497 return ret;
498}
499
500/*
501 * Force and wait upon a commit if the calling process is not within
502 * transaction. This is used for forcing out undo-protected data which contains
503 * bitmaps, when the fs is running out of space.
504 *
505 * We can only force the running transaction if we don't have an active handle;
506 * otherwise, we will deadlock.
507 *
508 * Returns true if a transaction was started.
509 */
510int journal_force_commit_nested(journal_t *journal)
511{
512 transaction_t *transaction = NULL;
513 tid_t tid;
514
515 spin_lock(&journal->j_state_lock);
516 if (journal->j_running_transaction && !current->journal_info) {
517 transaction = journal->j_running_transaction;
518 __log_start_commit(journal, transaction->t_tid);
519 } else if (journal->j_committing_transaction)
520 transaction = journal->j_committing_transaction;
521
522 if (!transaction) {
523 spin_unlock(&journal->j_state_lock);
524 return 0; /* Nothing to retry */
525 }
526
527 tid = transaction->t_tid;
528 spin_unlock(&journal->j_state_lock);
529 log_wait_commit(journal, tid);
530 return 1;
531}
532
533/*
534 * Start a commit of the current running transaction (if any). Returns true
535 * if a transaction is going to be committed (or is currently already
536 * committing), and fills its tid in at *ptid
537 */
538int journal_start_commit(journal_t *journal, tid_t *ptid)
539{
540 int ret = 0;
541
542 spin_lock(&journal->j_state_lock);
543 if (journal->j_running_transaction) {
544 tid_t tid = journal->j_running_transaction->t_tid;
545
546 __log_start_commit(journal, tid);
547 /* There's a running transaction and we've just made sure
 548 * its commit has been scheduled. */
549 if (ptid)
550 *ptid = tid;
551 ret = 1;
552 } else if (journal->j_committing_transaction) {
553 /*
554 * If commit has been started, then we have to wait for
555 * completion of that transaction.
556 */
557 if (ptid)
558 *ptid = journal->j_committing_transaction->t_tid;
559 ret = 1;
560 }
561 spin_unlock(&journal->j_state_lock);
562 return ret;
563}
564
565/*
566 * Wait for a specified commit to complete.
567 * The caller may not hold the journal lock.
568 */
569int log_wait_commit(journal_t *journal, tid_t tid)
570{
571 int err = 0;
572
573#ifdef CONFIG_JBD_DEBUG
574 spin_lock(&journal->j_state_lock);
575 if (!tid_geq(journal->j_commit_request, tid)) {
576 printk(KERN_ERR
577 "%s: error: j_commit_request=%d, tid=%d\n",
578 __func__, journal->j_commit_request, tid);
579 }
580 spin_unlock(&journal->j_state_lock);
581#endif
582 spin_lock(&journal->j_state_lock);
583 /*
584 * Not running or committing trans? Must be already committed. This
585 * saves us from waiting for a *long* time when tid overflows.
586 */
587 if (!((journal->j_running_transaction &&
588 journal->j_running_transaction->t_tid == tid) ||
589 (journal->j_committing_transaction &&
590 journal->j_committing_transaction->t_tid == tid)))
591 goto out_unlock;
592
593 if (!tid_geq(journal->j_commit_waited, tid))
594 journal->j_commit_waited = tid;
595 while (tid_gt(tid, journal->j_commit_sequence)) {
596 jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n",
597 tid, journal->j_commit_sequence);
598 wake_up(&journal->j_wait_commit);
599 spin_unlock(&journal->j_state_lock);
600 wait_event(journal->j_wait_done_commit,
601 !tid_gt(tid, journal->j_commit_sequence));
602 spin_lock(&journal->j_state_lock);
603 }
604out_unlock:
605 spin_unlock(&journal->j_state_lock);
606
607 if (unlikely(is_journal_aborted(journal)))
608 err = -EIO;
609 return err;
610}
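
The tid_gt()/tid_geq() checks above are wraparound-safe sequence comparisons; they live in include/linux/jbd.h in this tree, roughly along these lines (reproduced from memory as a sketch, not verbatim):

/* Sketch of wraparound-safe ordering for 32-bit transaction IDs. */
typedef unsigned int tid_t;          /* as in jbd.h */

static inline int tid_gt(tid_t x, tid_t y)
{
	int difference = (int)(x - y);   /* signed difference survives tid wraparound */
	return difference > 0;
}

static inline int tid_geq(tid_t x, tid_t y)
{
	int difference = (int)(x - y);
	return difference >= 0;
}
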
611
612/*
 613 * Return 1 if a given transaction has not yet sent a barrier request
 614 * connected with its transaction commit. If 0 is returned, the transaction
 615 * may or may not have sent the barrier. Used to avoid sending the barrier
 616 * twice in common cases.
617 */
618int journal_trans_will_send_data_barrier(journal_t *journal, tid_t tid)
619{
620 int ret = 0;
621 transaction_t *commit_trans;
622
623 if (!(journal->j_flags & JFS_BARRIER))
624 return 0;
625 spin_lock(&journal->j_state_lock);
626 /* Transaction already committed? */
627 if (tid_geq(journal->j_commit_sequence, tid))
628 goto out;
629 /*
630 * Transaction is being committed and we already proceeded to
631 * writing commit record?
632 */
633 commit_trans = journal->j_committing_transaction;
634 if (commit_trans && commit_trans->t_tid == tid &&
635 commit_trans->t_state >= T_COMMIT_RECORD)
636 goto out;
637 ret = 1;
638out:
639 spin_unlock(&journal->j_state_lock);
640 return ret;
641}
642EXPORT_SYMBOL(journal_trans_will_send_data_barrier);
643
644/*
645 * Log buffer allocation routines:
646 */
647
648int journal_next_log_block(journal_t *journal, unsigned int *retp)
649{
650 unsigned int blocknr;
651
652 spin_lock(&journal->j_state_lock);
653 J_ASSERT(journal->j_free > 1);
654
655 blocknr = journal->j_head;
656 journal->j_head++;
657 journal->j_free--;
658 if (journal->j_head == journal->j_last)
659 journal->j_head = journal->j_first;
660 spin_unlock(&journal->j_state_lock);
661 return journal_bmap(journal, blocknr, retp);
662}
663
664/*
665 * Conversion of logical to physical block numbers for the journal
666 *
667 * On external journals the journal blocks are identity-mapped, so
668 * this is a no-op. If needed, we can use j_blk_offset - everything is
669 * ready.
670 */
671int journal_bmap(journal_t *journal, unsigned int blocknr,
672 unsigned int *retp)
673{
674 int err = 0;
675 unsigned int ret;
676
677 if (journal->j_inode) {
678 ret = bmap(journal->j_inode, blocknr);
679 if (ret)
680 *retp = ret;
681 else {
682 char b[BDEVNAME_SIZE];
683
684 printk(KERN_ALERT "%s: journal block not found "
685 "at offset %u on %s\n",
686 __func__,
687 blocknr,
688 bdevname(journal->j_dev, b));
689 err = -EIO;
690 __journal_abort_soft(journal, err);
691 }
692 } else {
693 *retp = blocknr; /* +journal->j_blk_offset */
694 }
695 return err;
696}
697
698/*
699 * We play buffer_head aliasing tricks to write data/metadata blocks to
700 * the journal without copying their contents, but for journal
701 * descriptor blocks we do need to generate bona fide buffers.
702 *
703 * After the caller of journal_get_descriptor_buffer() has finished modifying
704 * the buffer's contents they really should run flush_dcache_page(bh->b_page).
705 * But we don't bother doing that, so there will be coherency problems with
706 * mmaps of blockdevs which hold live JBD-controlled filesystems.
707 */
708struct journal_head *journal_get_descriptor_buffer(journal_t *journal)
709{
710 struct buffer_head *bh;
711 unsigned int blocknr;
712 int err;
713
714 err = journal_next_log_block(journal, &blocknr);
715
716 if (err)
717 return NULL;
718
719 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
720 if (!bh)
721 return NULL;
722 lock_buffer(bh);
723 memset(bh->b_data, 0, journal->j_blocksize);
724 set_buffer_uptodate(bh);
725 unlock_buffer(bh);
726 BUFFER_TRACE(bh, "return this buffer");
727 return journal_add_journal_head(bh);
728}
729
730/*
731 * Management for journal control blocks: functions to create and
732 * destroy journal_t structures, and to initialise and read existing
733 * journal blocks from disk. */
734
735/* First: create and setup a journal_t object in memory. We initialise
736 * very few fields yet: that has to wait until we have created the
 737 * journal structures from scratch, or loaded them from disk. */
738
739static journal_t * journal_init_common (void)
740{
741 journal_t *journal;
742 int err;
743
744 journal = kzalloc(sizeof(*journal), GFP_KERNEL);
745 if (!journal)
746 goto fail;
747
748 init_waitqueue_head(&journal->j_wait_transaction_locked);
749 init_waitqueue_head(&journal->j_wait_logspace);
750 init_waitqueue_head(&journal->j_wait_done_commit);
751 init_waitqueue_head(&journal->j_wait_checkpoint);
752 init_waitqueue_head(&journal->j_wait_commit);
753 init_waitqueue_head(&journal->j_wait_updates);
754 mutex_init(&journal->j_checkpoint_mutex);
755 spin_lock_init(&journal->j_revoke_lock);
756 spin_lock_init(&journal->j_list_lock);
757 spin_lock_init(&journal->j_state_lock);
758
759 journal->j_commit_interval = (HZ * JBD_DEFAULT_MAX_COMMIT_AGE);
760
761 /* The journal is marked for error until we succeed with recovery! */
762 journal->j_flags = JFS_ABORT;
763
764 /* Set up a default-sized revoke table for the new mount. */
765 err = journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH);
766 if (err) {
767 kfree(journal);
768 goto fail;
769 }
770 return journal;
771fail:
772 return NULL;
773}
774
775/* journal_init_dev and journal_init_inode:
776 *
 777 * Create a journal structure assigned to some fixed set of disk blocks.
 778 * We don't actually touch those disk blocks yet, but we
779 * need to set up all of the mapping information to tell the journaling
780 * system where the journal blocks are.
781 *
782 */
783
784/**
785 * journal_t * journal_init_dev() - creates and initialises a journal structure
786 * @bdev: Block device on which to create the journal
 787 * @fs_dev: Device which holds the journalled filesystem for this journal.
 788 * @start: Block nr at which the journal starts.
789 * @len: Length of the journal in blocks.
790 * @blocksize: blocksize of journalling device
791 *
792 * Returns: a newly created journal_t *
793 *
794 * journal_init_dev creates a journal which maps a fixed contiguous
795 * range of blocks on an arbitrary block device.
796 *
797 */
798journal_t * journal_init_dev(struct block_device *bdev,
799 struct block_device *fs_dev,
800 int start, int len, int blocksize)
801{
802 journal_t *journal = journal_init_common();
803 struct buffer_head *bh;
804 int n;
805
806 if (!journal)
807 return NULL;
808
809 /* journal descriptor can store up to n blocks -bzzz */
810 journal->j_blocksize = blocksize;
811 n = journal->j_blocksize / sizeof(journal_block_tag_t);
812 journal->j_wbufsize = n;
813 journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
814 if (!journal->j_wbuf) {
815 printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n",
816 __func__);
817 goto out_err;
818 }
819 journal->j_dev = bdev;
820 journal->j_fs_dev = fs_dev;
821 journal->j_blk_offset = start;
822 journal->j_maxlen = len;
823
824 bh = __getblk(journal->j_dev, start, journal->j_blocksize);
825 if (!bh) {
826 printk(KERN_ERR
827 "%s: Cannot get buffer for journal superblock\n",
828 __func__);
829 goto out_err;
830 }
831 journal->j_sb_buffer = bh;
832 journal->j_superblock = (journal_superblock_t *)bh->b_data;
833
834 return journal;
835out_err:
836 kfree(journal->j_wbuf);
837 kfree(journal);
838 return NULL;
839}
840
841/**
 842 * journal_t * journal_init_inode () - creates a journal which maps to an inode.
843 * @inode: An inode to create the journal in
844 *
845 * journal_init_inode creates a journal which maps an on-disk inode as
846 * the journal. The inode must exist already, must support bmap() and
847 * must have all data blocks preallocated.
848 */
849journal_t * journal_init_inode (struct inode *inode)
850{
851 struct buffer_head *bh;
852 journal_t *journal = journal_init_common();
853 int err;
854 int n;
855 unsigned int blocknr;
856
857 if (!journal)
858 return NULL;
859
860 journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev;
861 journal->j_inode = inode;
862 jbd_debug(1,
863 "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n",
864 journal, inode->i_sb->s_id, inode->i_ino,
865 (long long) inode->i_size,
866 inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize);
867
868 journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits;
869 journal->j_blocksize = inode->i_sb->s_blocksize;
870
871 /* journal descriptor can store up to n blocks -bzzz */
872 n = journal->j_blocksize / sizeof(journal_block_tag_t);
873 journal->j_wbufsize = n;
874 journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
875 if (!journal->j_wbuf) {
876 printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n",
877 __func__);
878 goto out_err;
879 }
880
881 err = journal_bmap(journal, 0, &blocknr);
882 /* If that failed, give up */
883 if (err) {
884 printk(KERN_ERR "%s: Cannot locate journal superblock\n",
885 __func__);
886 goto out_err;
887 }
888
889 bh = getblk_unmovable(journal->j_dev, blocknr, journal->j_blocksize);
890 if (!bh) {
891 printk(KERN_ERR
892 "%s: Cannot get buffer for journal superblock\n",
893 __func__);
894 goto out_err;
895 }
896 journal->j_sb_buffer = bh;
897 journal->j_superblock = (journal_superblock_t *)bh->b_data;
898
899 return journal;
900out_err:
901 kfree(journal->j_wbuf);
902 kfree(journal);
903 return NULL;
904}
905
906/*
907 * If the journal init or create aborts, we need to mark the journal
908 * superblock as being NULL to prevent the journal destroy from writing
909 * back a bogus superblock.
910 */
911static void journal_fail_superblock (journal_t *journal)
912{
913 struct buffer_head *bh = journal->j_sb_buffer;
914 brelse(bh);
915 journal->j_sb_buffer = NULL;
916}
917
918/*
919 * Given a journal_t structure, initialise the various fields for
920 * startup of a new journaling session. We use this both when creating
921 * a journal, and after recovering an old journal to reset it for
922 * subsequent use.
923 */
924
925static int journal_reset(journal_t *journal)
926{
927 journal_superblock_t *sb = journal->j_superblock;
928 unsigned int first, last;
929
930 first = be32_to_cpu(sb->s_first);
931 last = be32_to_cpu(sb->s_maxlen);
932 if (first + JFS_MIN_JOURNAL_BLOCKS > last + 1) {
933 printk(KERN_ERR "JBD: Journal too short (blocks %u-%u).\n",
934 first, last);
935 journal_fail_superblock(journal);
936 return -EINVAL;
937 }
938
939 journal->j_first = first;
940 journal->j_last = last;
941
942 journal->j_head = first;
943 journal->j_tail = first;
944 journal->j_free = last - first;
945
946 journal->j_tail_sequence = journal->j_transaction_sequence;
947 journal->j_commit_sequence = journal->j_transaction_sequence - 1;
948 journal->j_commit_request = journal->j_commit_sequence;
949
950 journal->j_max_transaction_buffers = journal->j_maxlen / 4;
951
952 /*
953 * As a special case, if the on-disk copy is already marked as needing
954 * no recovery (s_start == 0), then we can safely defer the superblock
955 * update until the next commit by setting JFS_FLUSHED. This avoids
956 * attempting a write to a potential-readonly device.
957 */
958 if (sb->s_start == 0) {
959 jbd_debug(1,"JBD: Skipping superblock update on recovered sb "
960 "(start %u, seq %d, errno %d)\n",
961 journal->j_tail, journal->j_tail_sequence,
962 journal->j_errno);
963 journal->j_flags |= JFS_FLUSHED;
964 } else {
965 /* Lock here to make assertions happy... */
966 mutex_lock(&journal->j_checkpoint_mutex);
967 /*
 968 * Update log tail information. We use WRITE_FUA since a new
 969 * transaction will start reusing journal space, so we
 970 * must make sure information about the current log tail is on
 971 * disk before that.
972 */
973 journal_update_sb_log_tail(journal,
974 journal->j_tail_sequence,
975 journal->j_tail,
976 WRITE_FUA);
977 mutex_unlock(&journal->j_checkpoint_mutex);
978 }
979 return journal_start_thread(journal);
980}
981
982/**
983 * int journal_create() - Initialise the new journal file
984 * @journal: Journal to create. This structure must have been initialised
985 *
986 * Given a journal_t structure which tells us which disk blocks we can
987 * use, create a new journal superblock and initialise all of the
988 * journal fields from scratch.
989 **/
990int journal_create(journal_t *journal)
991{
992 unsigned int blocknr;
993 struct buffer_head *bh;
994 journal_superblock_t *sb;
995 int i, err;
996
997 if (journal->j_maxlen < JFS_MIN_JOURNAL_BLOCKS) {
998 printk (KERN_ERR "Journal length (%d blocks) too short.\n",
999 journal->j_maxlen);
1000 journal_fail_superblock(journal);
1001 return -EINVAL;
1002 }
1003
1004 if (journal->j_inode == NULL) {
1005 /*
1006 * We don't know what block to start at!
1007 */
1008 printk(KERN_EMERG
1009 "%s: creation of journal on external device!\n",
1010 __func__);
1011 BUG();
1012 }
1013
1014 /* Zero out the entire journal on disk. We cannot afford to
1015 have any blocks on disk beginning with JFS_MAGIC_NUMBER. */
1016 jbd_debug(1, "JBD: Zeroing out journal blocks...\n");
1017 for (i = 0; i < journal->j_maxlen; i++) {
1018 err = journal_bmap(journal, i, &blocknr);
1019 if (err)
1020 return err;
1021 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
1022 if (unlikely(!bh))
1023 return -ENOMEM;
1024 lock_buffer(bh);
1025 memset (bh->b_data, 0, journal->j_blocksize);
1026 BUFFER_TRACE(bh, "marking dirty");
1027 mark_buffer_dirty(bh);
1028 BUFFER_TRACE(bh, "marking uptodate");
1029 set_buffer_uptodate(bh);
1030 unlock_buffer(bh);
1031 __brelse(bh);
1032 }
1033
1034 sync_blockdev(journal->j_dev);
1035 jbd_debug(1, "JBD: journal cleared.\n");
1036
1037 /* OK, fill in the initial static fields in the new superblock */
1038 sb = journal->j_superblock;
1039
1040 sb->s_header.h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
1041 sb->s_header.h_blocktype = cpu_to_be32(JFS_SUPERBLOCK_V2);
1042
1043 sb->s_blocksize = cpu_to_be32(journal->j_blocksize);
1044 sb->s_maxlen = cpu_to_be32(journal->j_maxlen);
1045 sb->s_first = cpu_to_be32(1);
1046
1047 journal->j_transaction_sequence = 1;
1048
1049 journal->j_flags &= ~JFS_ABORT;
1050 journal->j_format_version = 2;
1051
1052 return journal_reset(journal);
1053}
1054
1055static void journal_write_superblock(journal_t *journal, int write_op)
1056{
1057 struct buffer_head *bh = journal->j_sb_buffer;
1058 int ret;
1059
1060 trace_journal_write_superblock(journal, write_op);
1061 if (!(journal->j_flags & JFS_BARRIER))
1062 write_op &= ~(REQ_FUA | REQ_FLUSH);
1063 lock_buffer(bh);
1064 if (buffer_write_io_error(bh)) {
1065 char b[BDEVNAME_SIZE];
1066 /*
1067 * Oh, dear. A previous attempt to write the journal
1068 * superblock failed. This could happen because the
1069 * USB device was yanked out. Or it could happen to
1070 * be a transient write error and maybe the block will
1071 * be remapped. Nothing we can do but to retry the
1072 * write and hope for the best.
1073 */
1074 printk(KERN_ERR "JBD: previous I/O error detected "
1075 "for journal superblock update for %s.\n",
1076 journal_dev_name(journal, b));
1077 clear_buffer_write_io_error(bh);
1078 set_buffer_uptodate(bh);
1079 }
1080
1081 get_bh(bh);
1082 bh->b_end_io = end_buffer_write_sync;
1083 ret = submit_bh(write_op, bh);
1084 wait_on_buffer(bh);
1085 if (buffer_write_io_error(bh)) {
1086 clear_buffer_write_io_error(bh);
1087 set_buffer_uptodate(bh);
1088 ret = -EIO;
1089 }
1090 if (ret) {
1091 char b[BDEVNAME_SIZE];
1092 printk(KERN_ERR "JBD: Error %d detected "
1093 "when updating journal superblock for %s.\n",
1094 ret, journal_dev_name(journal, b));
1095 }
1096}
1097
1098/**
1099 * journal_update_sb_log_tail() - Update log tail in journal sb on disk.
1100 * @journal: The journal to update.
1101 * @tail_tid: TID of the new transaction at the tail of the log
1102 * @tail_block: The first block of the transaction at the tail of the log
1103 * @write_op: With which operation should we write the journal sb
1104 *
1105 * Update a journal's superblock information about log tail and write it to
1106 * disk, waiting for the IO to complete.
1107 */
1108void journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
1109 unsigned int tail_block, int write_op)
1110{
1111 journal_superblock_t *sb = journal->j_superblock;
1112
1113 BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
1114 jbd_debug(1,"JBD: updating superblock (start %u, seq %u)\n",
1115 tail_block, tail_tid);
1116
1117 sb->s_sequence = cpu_to_be32(tail_tid);
1118 sb->s_start = cpu_to_be32(tail_block);
1119
1120 journal_write_superblock(journal, write_op);
1121
1122 /* Log is no longer empty */
1123 spin_lock(&journal->j_state_lock);
1124 WARN_ON(!sb->s_sequence);
1125 journal->j_flags &= ~JFS_FLUSHED;
1126 spin_unlock(&journal->j_state_lock);
1127}
1128
1129/**
1130 * mark_journal_empty() - Mark on disk journal as empty.
1131 * @journal: The journal to update.
1132 *
1133 * Update a journal's dynamic superblock fields to show that journal is empty.
1134 * Write updated superblock to disk waiting for IO to complete.
1135 */
1136static void mark_journal_empty(journal_t *journal)
1137{
1138 journal_superblock_t *sb = journal->j_superblock;
1139
1140 BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
1141 spin_lock(&journal->j_state_lock);
1142 /* Is it already empty? */
1143 if (sb->s_start == 0) {
1144 spin_unlock(&journal->j_state_lock);
1145 return;
1146 }
1147 jbd_debug(1, "JBD: Marking journal as empty (seq %d)\n",
1148 journal->j_tail_sequence);
1149
1150 sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
1151 sb->s_start = cpu_to_be32(0);
1152 spin_unlock(&journal->j_state_lock);
1153
1154 journal_write_superblock(journal, WRITE_FUA);
1155
1156 spin_lock(&journal->j_state_lock);
1157 /* Log is empty */
1158 journal->j_flags |= JFS_FLUSHED;
1159 spin_unlock(&journal->j_state_lock);
1160}
1161
1162/**
1163 * journal_update_sb_errno() - Update error in the journal.
1164 * @journal: The journal to update.
1165 *
1166 * Update a journal's errno. Write updated superblock to disk waiting for IO
1167 * to complete.
1168 */
1169static void journal_update_sb_errno(journal_t *journal)
1170{
1171 journal_superblock_t *sb = journal->j_superblock;
1172
1173 spin_lock(&journal->j_state_lock);
1174 jbd_debug(1, "JBD: updating superblock error (errno %d)\n",
1175 journal->j_errno);
1176 sb->s_errno = cpu_to_be32(journal->j_errno);
1177 spin_unlock(&journal->j_state_lock);
1178
1179 journal_write_superblock(journal, WRITE_SYNC);
1180}
1181
1182/*
1183 * Read the superblock for a given journal, performing initial
1184 * validation of the format.
1185 */
1186
1187static int journal_get_superblock(journal_t *journal)
1188{
1189 struct buffer_head *bh;
1190 journal_superblock_t *sb;
1191 int err = -EIO;
1192
1193 bh = journal->j_sb_buffer;
1194
1195 J_ASSERT(bh != NULL);
1196 if (!buffer_uptodate(bh)) {
1197 ll_rw_block(READ, 1, &bh);
1198 wait_on_buffer(bh);
1199 if (!buffer_uptodate(bh)) {
1200 printk (KERN_ERR
1201 "JBD: IO error reading journal superblock\n");
1202 goto out;
1203 }
1204 }
1205
1206 sb = journal->j_superblock;
1207
1208 err = -EINVAL;
1209
1210 if (sb->s_header.h_magic != cpu_to_be32(JFS_MAGIC_NUMBER) ||
1211 sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) {
1212 printk(KERN_WARNING "JBD: no valid journal superblock found\n");
1213 goto out;
1214 }
1215
1216 switch(be32_to_cpu(sb->s_header.h_blocktype)) {
1217 case JFS_SUPERBLOCK_V1:
1218 journal->j_format_version = 1;
1219 break;
1220 case JFS_SUPERBLOCK_V2:
1221 journal->j_format_version = 2;
1222 break;
1223 default:
1224 printk(KERN_WARNING "JBD: unrecognised superblock format ID\n");
1225 goto out;
1226 }
1227
1228 if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen)
1229 journal->j_maxlen = be32_to_cpu(sb->s_maxlen);
1230 else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) {
1231 printk (KERN_WARNING "JBD: journal file too short\n");
1232 goto out;
1233 }
1234
1235 if (be32_to_cpu(sb->s_first) == 0 ||
1236 be32_to_cpu(sb->s_first) >= journal->j_maxlen) {
1237 printk(KERN_WARNING
1238 "JBD: Invalid start block of journal: %u\n",
1239 be32_to_cpu(sb->s_first));
1240 goto out;
1241 }
1242
1243 return 0;
1244
1245out:
1246 journal_fail_superblock(journal);
1247 return err;
1248}
1249
1250/*
1251 * Load the on-disk journal superblock and read the key fields into the
1252 * journal_t.
1253 */
1254
1255static int load_superblock(journal_t *journal)
1256{
1257 int err;
1258 journal_superblock_t *sb;
1259
1260 err = journal_get_superblock(journal);
1261 if (err)
1262 return err;
1263
1264 sb = journal->j_superblock;
1265
1266 journal->j_tail_sequence = be32_to_cpu(sb->s_sequence);
1267 journal->j_tail = be32_to_cpu(sb->s_start);
1268 journal->j_first = be32_to_cpu(sb->s_first);
1269 journal->j_last = be32_to_cpu(sb->s_maxlen);
1270 journal->j_errno = be32_to_cpu(sb->s_errno);
1271
1272 return 0;
1273}
1274
1275
1276/**
1277 * int journal_load() - Read journal from disk.
1278 * @journal: Journal to act on.
1279 *
1280 * Given a journal_t structure which tells us which disk blocks contain
1281 * a journal, read the journal from disk to initialise the in-memory
1282 * structures.
1283 */
1284int journal_load(journal_t *journal)
1285{
1286 int err;
1287 journal_superblock_t *sb;
1288
1289 err = load_superblock(journal);
1290 if (err)
1291 return err;
1292
1293 sb = journal->j_superblock;
1294 /* If this is a V2 superblock, then we have to check the
1295 * features flags on it. */
1296
1297 if (journal->j_format_version >= 2) {
1298 if ((sb->s_feature_ro_compat &
1299 ~cpu_to_be32(JFS_KNOWN_ROCOMPAT_FEATURES)) ||
1300 (sb->s_feature_incompat &
1301 ~cpu_to_be32(JFS_KNOWN_INCOMPAT_FEATURES))) {
1302 printk (KERN_WARNING
1303 "JBD: Unrecognised features on journal\n");
1304 return -EINVAL;
1305 }
1306 }
1307
1308 /* Let the recovery code check whether it needs to recover any
1309 * data from the journal. */
1310 if (journal_recover(journal))
1311 goto recovery_error;
1312
1313 /* OK, we've finished with the dynamic journal bits:
1314 * reinitialise the dynamic contents of the superblock in memory
1315 * and reset them on disk. */
1316 if (journal_reset(journal))
1317 goto recovery_error;
1318
1319 journal->j_flags &= ~JFS_ABORT;
1320 journal->j_flags |= JFS_LOADED;
1321 return 0;
1322
1323recovery_error:
1324 printk (KERN_WARNING "JBD: recovery failed\n");
1325 return -EIO;
1326}
1327
1328/**
1329 * void journal_destroy() - Release a journal_t structure.
1330 * @journal: Journal to act on.
1331 *
1332 * Release a journal_t structure once it is no longer in use by the
1333 * journaled object.
1334 * Return <0 if we couldn't clean up the journal.
1335 */
1336int journal_destroy(journal_t *journal)
1337{
1338 int err = 0;
1339
1340
1341 /* Wait for the commit thread to wake up and die. */
1342 journal_kill_thread(journal);
1343
1344 /* Force a final log commit */
1345 if (journal->j_running_transaction)
1346 journal_commit_transaction(journal);
1347
1348 /* Force any old transactions to disk */
1349
1350 /* We cannot race with anybody but must keep assertions happy */
1351 mutex_lock(&journal->j_checkpoint_mutex);
1352 /* Totally anal locking here... */
1353 spin_lock(&journal->j_list_lock);
1354 while (journal->j_checkpoint_transactions != NULL) {
1355 spin_unlock(&journal->j_list_lock);
1356 log_do_checkpoint(journal);
1357 spin_lock(&journal->j_list_lock);
1358 }
1359
1360 J_ASSERT(journal->j_running_transaction == NULL);
1361 J_ASSERT(journal->j_committing_transaction == NULL);
1362 J_ASSERT(journal->j_checkpoint_transactions == NULL);
1363 spin_unlock(&journal->j_list_lock);
1364
1365 if (journal->j_sb_buffer) {
1366 if (!is_journal_aborted(journal)) {
1367 journal->j_tail_sequence =
1368 ++journal->j_transaction_sequence;
1369 mark_journal_empty(journal);
1370 } else
1371 err = -EIO;
1372 brelse(journal->j_sb_buffer);
1373 }
1374 mutex_unlock(&journal->j_checkpoint_mutex);
1375
1376 iput(journal->j_inode);
1377 if (journal->j_revoke)
1378 journal_destroy_revoke(journal);
1379 kfree(journal->j_wbuf);
1380 kfree(journal);
1381
1382 return err;
1383}
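
Taken together, journal_init_inode(), journal_load() and journal_destroy() form the journal lifecycle used by a client filesystem such as ext3. A minimal sketch of a caller, assuming a preallocated journal inode and with error handling reduced to the essentials:

/* Illustrative lifecycle sketch; 'journal_inode' is assumed to be preallocated. */
#include <linux/jbd.h>

static int example_setup_journal(struct inode *journal_inode, journal_t **out)
{
	journal_t *journal;
	int err;

	journal = journal_init_inode(journal_inode);
	if (!journal)
		return -ENOMEM;

	err = journal_load(journal);      /* replays the log if recovery is needed */
	if (err) {
		journal_destroy(journal);
		return err;
	}

	*out = journal;                   /* released later with journal_destroy() */
	return 0;
}
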
1384
1385
1386/**
 1387 * int journal_check_used_features() - Check if features specified are used.
1388 * @journal: Journal to check.
1389 * @compat: bitmask of compatible features
1390 * @ro: bitmask of features that force read-only mount
1391 * @incompat: bitmask of incompatible features
1392 *
1393 * Check whether the journal uses all of a given set of
1394 * features. Return true (non-zero) if it does.
1395 **/
1396
1397int journal_check_used_features (journal_t *journal, unsigned long compat,
1398 unsigned long ro, unsigned long incompat)
1399{
1400 journal_superblock_t *sb;
1401
1402 if (!compat && !ro && !incompat)
1403 return 1;
1404 if (journal->j_format_version == 1)
1405 return 0;
1406
1407 sb = journal->j_superblock;
1408
1409 if (((be32_to_cpu(sb->s_feature_compat) & compat) == compat) &&
1410 ((be32_to_cpu(sb->s_feature_ro_compat) & ro) == ro) &&
1411 ((be32_to_cpu(sb->s_feature_incompat) & incompat) == incompat))
1412 return 1;
1413
1414 return 0;
1415}
1416
1417/**
1418 * int journal_check_available_features() - Check feature set in journalling layer
1419 * @journal: Journal to check.
1420 * @compat: bitmask of compatible features
1421 * @ro: bitmask of features that force read-only mount
1422 * @incompat: bitmask of incompatible features
1423 *
1424 * Check whether the journaling code supports the use of
1425 * all of a given set of features on this journal. Return true
1426 * (non-zero) if it can. */
1427
1428int journal_check_available_features (journal_t *journal, unsigned long compat,
1429 unsigned long ro, unsigned long incompat)
1430{
1431 if (!compat && !ro && !incompat)
1432 return 1;
1433
1434 /* We can support any known requested features iff the
1435 * superblock is in version 2. Otherwise we fail to support any
1436 * extended sb features. */
1437
1438 if (journal->j_format_version != 2)
1439 return 0;
1440
1441 if ((compat & JFS_KNOWN_COMPAT_FEATURES) == compat &&
1442 (ro & JFS_KNOWN_ROCOMPAT_FEATURES) == ro &&
1443 (incompat & JFS_KNOWN_INCOMPAT_FEATURES) == incompat)
1444 return 1;
1445
1446 return 0;
1447}
1448
1449/**
1450 * int journal_set_features () - Mark a given journal feature in the superblock
1451 * @journal: Journal to act on.
1452 * @compat: bitmask of compatible features
1453 * @ro: bitmask of features that force read-only mount
1454 * @incompat: bitmask of incompatible features
1455 *
1456 * Mark a given journal feature as present on the
1457 * superblock. Returns true if the requested features could be set.
1458 *
1459 */
1460
1461int journal_set_features (journal_t *journal, unsigned long compat,
1462 unsigned long ro, unsigned long incompat)
1463{
1464 journal_superblock_t *sb;
1465
1466 if (journal_check_used_features(journal, compat, ro, incompat))
1467 return 1;
1468
1469 if (!journal_check_available_features(journal, compat, ro, incompat))
1470 return 0;
1471
1472 jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n",
1473 compat, ro, incompat);
1474
1475 sb = journal->j_superblock;
1476
1477 sb->s_feature_compat |= cpu_to_be32(compat);
1478 sb->s_feature_ro_compat |= cpu_to_be32(ro);
1479 sb->s_feature_incompat |= cpu_to_be32(incompat);
1480
1481 return 1;
1482}
1483
1484
1485/**
1486 * int journal_update_format () - Update on-disk journal structure.
1487 * @journal: Journal to act on.
1488 *
1489 * Given an initialised but unloaded journal struct, poke about in the
1490 * on-disk structure to update it to the most recent supported version.
1491 */
1492int journal_update_format (journal_t *journal)
1493{
1494 journal_superblock_t *sb;
1495 int err;
1496
1497 err = journal_get_superblock(journal);
1498 if (err)
1499 return err;
1500
1501 sb = journal->j_superblock;
1502
1503 switch (be32_to_cpu(sb->s_header.h_blocktype)) {
1504 case JFS_SUPERBLOCK_V2:
1505 return 0;
1506 case JFS_SUPERBLOCK_V1:
1507 return journal_convert_superblock_v1(journal, sb);
1508 default:
1509 break;
1510 }
1511 return -EINVAL;
1512}
1513
1514static int journal_convert_superblock_v1(journal_t *journal,
1515 journal_superblock_t *sb)
1516{
1517 int offset, blocksize;
1518 struct buffer_head *bh;
1519
1520 printk(KERN_WARNING
1521 "JBD: Converting superblock from version 1 to 2.\n");
1522
1523 /* Pre-initialise new fields to zero */
1524 offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb);
1525 blocksize = be32_to_cpu(sb->s_blocksize);
1526 memset(&sb->s_feature_compat, 0, blocksize-offset);
1527
1528 sb->s_nr_users = cpu_to_be32(1);
1529 sb->s_header.h_blocktype = cpu_to_be32(JFS_SUPERBLOCK_V2);
1530 journal->j_format_version = 2;
1531
1532 bh = journal->j_sb_buffer;
1533 BUFFER_TRACE(bh, "marking dirty");
1534 mark_buffer_dirty(bh);
1535 sync_dirty_buffer(bh);
1536 return 0;
1537}
1538
1539
1540/**
1541 * int journal_flush () - Flush journal
1542 * @journal: Journal to act on.
1543 *
1544 * Flush all data for a given journal to disk and empty the journal.
1545 * Filesystems can use this when remounting readonly to ensure that
1546 * recovery does not need to happen on remount.
1547 */
1548
1549int journal_flush(journal_t *journal)
1550{
1551 int err = 0;
1552 transaction_t *transaction = NULL;
1553
1554 spin_lock(&journal->j_state_lock);
1555
1556 /* Force everything buffered to the log... */
1557 if (journal->j_running_transaction) {
1558 transaction = journal->j_running_transaction;
1559 __log_start_commit(journal, transaction->t_tid);
1560 } else if (journal->j_committing_transaction)
1561 transaction = journal->j_committing_transaction;
1562
1563 /* Wait for the log commit to complete... */
1564 if (transaction) {
1565 tid_t tid = transaction->t_tid;
1566
1567 spin_unlock(&journal->j_state_lock);
1568 log_wait_commit(journal, tid);
1569 } else {
1570 spin_unlock(&journal->j_state_lock);
1571 }
1572
1573 /* ...and flush everything in the log out to disk. */
1574 spin_lock(&journal->j_list_lock);
1575 while (!err && journal->j_checkpoint_transactions != NULL) {
1576 spin_unlock(&journal->j_list_lock);
1577 mutex_lock(&journal->j_checkpoint_mutex);
1578 err = log_do_checkpoint(journal);
1579 mutex_unlock(&journal->j_checkpoint_mutex);
1580 spin_lock(&journal->j_list_lock);
1581 }
1582 spin_unlock(&journal->j_list_lock);
1583
1584 if (is_journal_aborted(journal))
1585 return -EIO;
1586
1587 mutex_lock(&journal->j_checkpoint_mutex);
1588 cleanup_journal_tail(journal);
1589
1590 /* Finally, mark the journal as really needing no recovery.
1591 * This sets s_start==0 in the underlying superblock, which is
1592 * the magic code for a fully-recovered superblock. Any future
1593 * commits of data to the journal will restore the current
1594 * s_start value. */
1595 mark_journal_empty(journal);
1596 mutex_unlock(&journal->j_checkpoint_mutex);
1597 spin_lock(&journal->j_state_lock);
1598 J_ASSERT(!journal->j_running_transaction);
1599 J_ASSERT(!journal->j_committing_transaction);
1600 J_ASSERT(!journal->j_checkpoint_transactions);
1601 J_ASSERT(journal->j_head == journal->j_tail);
1602 J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
1603 spin_unlock(&journal->j_state_lock);
1604 return 0;
1605}
1606
1607/**
1608 * int journal_wipe() - Wipe journal contents
1609 * @journal: Journal to act on.
1610 * @write: flag (see below)
1611 *
1612 * Wipe out all of the contents of a journal, safely. This will produce
1613 * a warning if the journal contains any valid recovery information.
1614 * Must be called between journal_init_*() and journal_load().
1615 *
1616 * If 'write' is non-zero, then we wipe out the journal on disk; otherwise
1617 * we merely suppress recovery.
1618 */
1619
1620int journal_wipe(journal_t *journal, int write)
1621{
1622 int err = 0;
1623
1624 J_ASSERT (!(journal->j_flags & JFS_LOADED));
1625
1626 err = load_superblock(journal);
1627 if (err)
1628 return err;
1629
1630 if (!journal->j_tail)
1631 goto no_recovery;
1632
1633 printk (KERN_WARNING "JBD: %s recovery information on journal\n",
1634 write ? "Clearing" : "Ignoring");
1635
1636 err = journal_skip_recovery(journal);
1637 if (write) {
1638 /* Lock to make assertions happy... */
1639 mutex_lock(&journal->j_checkpoint_mutex);
1640 mark_journal_empty(journal);
1641 mutex_unlock(&journal->j_checkpoint_mutex);
1642 }
1643
1644 no_recovery:
1645 return err;
1646}
1647
1648/*
1649 * journal_dev_name: format a character string to describe on what
1650 * device this journal is present.
1651 */
1652
1653static const char *journal_dev_name(journal_t *journal, char *buffer)
1654{
1655 struct block_device *bdev;
1656
1657 if (journal->j_inode)
1658 bdev = journal->j_inode->i_sb->s_bdev;
1659 else
1660 bdev = journal->j_dev;
1661
1662 return bdevname(bdev, buffer);
1663}
1664
1665/*
 1666 * Journal abort has very specific semantics, which are described in
 1667 * full with journal_abort() below.
 1668 *
 1669 * Two internal functions, which provide abort to the jbd layer
 1670 * itself, are here.
1671 */
1672
1673/*
1674 * Quick version for internal journal use (doesn't lock the journal).
1675 * Aborts hard --- we mark the abort as occurred, but do _nothing_ else,
1676 * and don't attempt to make any other journal updates.
1677 */
1678static void __journal_abort_hard(journal_t *journal)
1679{
1680 transaction_t *transaction;
1681 char b[BDEVNAME_SIZE];
1682
1683 if (journal->j_flags & JFS_ABORT)
1684 return;
1685
1686 printk(KERN_ERR "Aborting journal on device %s.\n",
1687 journal_dev_name(journal, b));
1688
1689 spin_lock(&journal->j_state_lock);
1690 journal->j_flags |= JFS_ABORT;
1691 transaction = journal->j_running_transaction;
1692 if (transaction)
1693 __log_start_commit(journal, transaction->t_tid);
1694 spin_unlock(&journal->j_state_lock);
1695}
1696
1697/* Soft abort: record the abort error status in the journal superblock,
1698 * but don't do any other IO. */
1699static void __journal_abort_soft (journal_t *journal, int errno)
1700{
1701 if (journal->j_flags & JFS_ABORT)
1702 return;
1703
1704 if (!journal->j_errno)
1705 journal->j_errno = errno;
1706
1707 __journal_abort_hard(journal);
1708
1709 if (errno)
1710 journal_update_sb_errno(journal);
1711}
1712
1713/**
1714 * void journal_abort () - Shutdown the journal immediately.
1715 * @journal: the journal to shutdown.
1716 * @errno: an error number to record in the journal indicating
1717 * the reason for the shutdown.
1718 *
1719 * Perform a complete, immediate shutdown of the ENTIRE
1720 * journal (not of a single transaction). This operation cannot be
1721 * undone without closing and reopening the journal.
1722 *
1723 * The journal_abort function is intended to support higher level error
1724 * recovery mechanisms such as the ext2/ext3 remount-readonly error
1725 * mode.
1726 *
1727 * Journal abort has very specific semantics. Any existing dirty,
1728 * unjournaled buffers in the main filesystem will still be written to
1729 * disk by bdflush, but the journaling mechanism will be suspended
1730 * immediately and no further transaction commits will be honoured.
1731 *
1732 * Any dirty, journaled buffers will be written back to disk without
1733 * hitting the journal. Atomicity cannot be guaranteed on an aborted
1734 * filesystem, but we _do_ attempt to leave as much data as possible
1735 * behind for fsck to use for cleanup.
1736 *
1737 * Any attempt to get a new transaction handle on a journal which is in
1738 * ABORT state will just result in an -EROFS error return. A
1739 * journal_stop on an existing handle will return -EIO if we have
1740 * entered abort state during the update.
1741 *
1742 * Recursive transactions are not disturbed by journal abort until the
1743 * final journal_stop, which will receive the -EIO error.
1744 *
1745 * Finally, the journal_abort call allows the caller to supply an errno
1746 * which will be recorded (if possible) in the journal superblock. This
1747 * allows a client to record failure conditions in the middle of a
1748 * transaction without having to complete the transaction to record the
1749 * failure to disk. ext3_error, for example, now uses this
1750 * functionality.
1751 *
1752 * Errors which originate from within the journaling layer will NOT
1753 * supply an errno; a null errno implies that absolutely no further
1754 * writes are done to the journal (unless there are any already in
1755 * progress).
1756 *
1757 */
1758
1759void journal_abort(journal_t *journal, int errno)
1760{
1761 __journal_abort_soft(journal, errno);
1762}
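A hedged sketch of the higher-level use described above, in the spirit of an ext3_error-style handler; the surrounding remount policy is illustrative rather than taken from this code:

	/* Record the failure in the journal superblock, stop further
	 * commits, and fall back to a read-only filesystem. */
	if (journal)
		journal_abort(journal, -EIO);
	sb->s_flags |= MS_RDONLY;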
1763
1764/**
1765 * int journal_errno () - returns the journal's error state.
1766 * @journal: journal to examine.
1767 *
1768 * This is the errno number set with journal_abort(), the last
1769 * time the journal was mounted - if the journal was stopped
1770 * without calling abort this will be 0.
1771 *
1772 * If the journal has been aborted during this mount, -EROFS will
1773 * be returned.
1774 */
1775int journal_errno(journal_t *journal)
1776{
1777 int err;
1778
1779 spin_lock(&journal->j_state_lock);
1780 if (journal->j_flags & JFS_ABORT)
1781 err = -EROFS;
1782 else
1783 err = journal->j_errno;
1784 spin_unlock(&journal->j_state_lock);
1785 return err;
1786}
1787
1788/**
1789 * int journal_clear_err () - clears the journal's error state
1790 * @journal: journal to act on.
1791 *
1792 * An error must be cleared or Acked to take a FS out of readonly
1793 * mode.
1794 */
1795int journal_clear_err(journal_t *journal)
1796{
1797 int err = 0;
1798
1799 spin_lock(&journal->j_state_lock);
1800 if (journal->j_flags & JFS_ABORT)
1801 err = -EROFS;
1802 else
1803 journal->j_errno = 0;
1804 spin_unlock(&journal->j_state_lock);
1805 return err;
1806}
1807
1808/**
1809 * void journal_ack_err() - Ack journal err.
1810 * @journal: journal to act on.
1811 *
1812 * An error must be cleared or Acked to take a FS out of readonly
1813 * mode.
1814 */
1815void journal_ack_err(journal_t *journal)
1816{
1817 spin_lock(&journal->j_state_lock);
1818 if (journal->j_errno)
1819 journal->j_flags |= JFS_ACK_ERR;
1820 spin_unlock(&journal->j_state_lock);
1821}
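Taken together, a remount-read-write path might consume this recorded error state roughly as follows (a hedged sketch; the surrounding checks are illustrative):

	err = journal_errno(journal);
	if (err) {
		/* Acknowledge and clear the recorded error so the
		 * filesystem is allowed to leave read-only mode. */
		journal_ack_err(journal);
		journal_clear_err(journal);
	}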
1822
1823int journal_blocks_per_page(struct inode *inode)
1824{
1825 return 1 << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1826}
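With 4 KB pages (PAGE_CACHE_SHIFT == 12) and a 1 KB filesystem block size (s_blocksize_bits == 10), for example, this evaluates to 1 << 2 == 4 journal blocks per page.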
1827
1828/*
1829 * Journal_head storage management
1830 */
1831static struct kmem_cache *journal_head_cache;
1832#ifdef CONFIG_JBD_DEBUG
1833static atomic_t nr_journal_heads = ATOMIC_INIT(0);
1834#endif
1835
1836static int journal_init_journal_head_cache(void)
1837{
1838 int retval;
1839
1840 J_ASSERT(journal_head_cache == NULL);
1841 journal_head_cache = kmem_cache_create("journal_head",
1842 sizeof(struct journal_head),
1843 0, /* offset */
1844 SLAB_TEMPORARY, /* flags */
1845 NULL); /* ctor */
1846 retval = 0;
1847 if (!journal_head_cache) {
1848 retval = -ENOMEM;
1849 printk(KERN_EMERG "JBD: no memory for journal_head cache\n");
1850 }
1851 return retval;
1852}
1853
1854static void journal_destroy_journal_head_cache(void)
1855{
1856 if (journal_head_cache) {
1857 kmem_cache_destroy(journal_head_cache);
1858 journal_head_cache = NULL;
1859 }
1860}
1861
1862/*
1863 * journal_head splicing and dicing
1864 */
1865static struct journal_head *journal_alloc_journal_head(void)
1866{
1867 struct journal_head *ret;
1868
1869#ifdef CONFIG_JBD_DEBUG
1870 atomic_inc(&nr_journal_heads);
1871#endif
1872 ret = kmem_cache_zalloc(journal_head_cache, GFP_NOFS);
1873 if (ret == NULL) {
1874 jbd_debug(1, "out of memory for journal_head\n");
1875 printk_ratelimited(KERN_NOTICE "ENOMEM in %s, retrying.\n",
1876 __func__);
1877
1878 while (ret == NULL) {
1879 yield();
1880 ret = kmem_cache_zalloc(journal_head_cache, GFP_NOFS);
1881 }
1882 }
1883 return ret;
1884}
1885
1886static void journal_free_journal_head(struct journal_head *jh)
1887{
1888#ifdef CONFIG_JBD_DEBUG
1889 atomic_dec(&nr_journal_heads);
1890 memset(jh, JBD_POISON_FREE, sizeof(*jh));
1891#endif
1892 kmem_cache_free(journal_head_cache, jh);
1893}
1894
1895/*
1896 * A journal_head is attached to a buffer_head whenever JBD has an
1897 * interest in the buffer.
1898 *
1899 * Whenever a buffer has an attached journal_head, its ->b_state:BH_JBD bit
1900 * is set. This bit is tested in core kernel code where we need to take
1901 * JBD-specific actions. Testing the zeroness of ->b_private is not reliable
1902 * there.
1903 *
1904 * When a buffer has its BH_JBD bit set, its ->b_count is elevated by one.
1905 *
1906 * When a buffer has its BH_JBD bit set it is immune from being released by
1907 * core kernel code, mainly via ->b_count.
1908 *
1909 * A journal_head is detached from its buffer_head when the journal_head's
1910 * b_jcount reaches zero. Running transaction (b_transaction) and checkpoint
1911 * transaction (b_cp_transaction) hold their references to b_jcount.
1912 *
1913 * Various places in the kernel want to attach a journal_head to a buffer_head
1914 * _before_ attaching the journal_head to a transaction. To protect the
1915 * journal_head in this situation, journal_add_journal_head elevates the
1916 * journal_head's b_jcount refcount by one. The caller must call
1917 * journal_put_journal_head() to undo this.
1918 *
1919 * So the typical usage would be:
1920 *
1921 * (Attach a journal_head if needed. Increments b_jcount)
1922 * struct journal_head *jh = journal_add_journal_head(bh);
1923 * ...
1924 * (Get another reference for transaction)
1925 * journal_grab_journal_head(bh);
1926 * jh->b_transaction = xxx;
1927 * (Put original reference)
1928 * journal_put_journal_head(jh);
1929 */
1930
1931/*
1932 * Give a buffer_head a journal_head.
1933 *
1934 * May sleep.
1935 */
1936struct journal_head *journal_add_journal_head(struct buffer_head *bh)
1937{
1938 struct journal_head *jh;
1939 struct journal_head *new_jh = NULL;
1940
1941repeat:
1942 if (!buffer_jbd(bh))
1943 new_jh = journal_alloc_journal_head();
1944
1945 jbd_lock_bh_journal_head(bh);
1946 if (buffer_jbd(bh)) {
1947 jh = bh2jh(bh);
1948 } else {
1949 J_ASSERT_BH(bh,
1950 (atomic_read(&bh->b_count) > 0) ||
1951 (bh->b_page && bh->b_page->mapping));
1952
1953 if (!new_jh) {
1954 jbd_unlock_bh_journal_head(bh);
1955 goto repeat;
1956 }
1957
1958 jh = new_jh;
1959 new_jh = NULL; /* We consumed it */
1960 set_buffer_jbd(bh);
1961 bh->b_private = jh;
1962 jh->b_bh = bh;
1963 get_bh(bh);
1964 BUFFER_TRACE(bh, "added journal_head");
1965 }
1966 jh->b_jcount++;
1967 jbd_unlock_bh_journal_head(bh);
1968 if (new_jh)
1969 journal_free_journal_head(new_jh);
1970 return bh->b_private;
1971}
1972
1973/*
1974 * Grab a ref against this buffer_head's journal_head. If it ended up not
1975 * having a journal_head, return NULL
1976 */
1977struct journal_head *journal_grab_journal_head(struct buffer_head *bh)
1978{
1979 struct journal_head *jh = NULL;
1980
1981 jbd_lock_bh_journal_head(bh);
1982 if (buffer_jbd(bh)) {
1983 jh = bh2jh(bh);
1984 jh->b_jcount++;
1985 }
1986 jbd_unlock_bh_journal_head(bh);
1987 return jh;
1988}
1989
1990static void __journal_remove_journal_head(struct buffer_head *bh)
1991{
1992 struct journal_head *jh = bh2jh(bh);
1993
1994 J_ASSERT_JH(jh, jh->b_jcount >= 0);
1995 J_ASSERT_JH(jh, jh->b_transaction == NULL);
1996 J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
1997 J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
1998 J_ASSERT_JH(jh, jh->b_jlist == BJ_None);
1999 J_ASSERT_BH(bh, buffer_jbd(bh));
2000 J_ASSERT_BH(bh, jh2bh(jh) == bh);
2001 BUFFER_TRACE(bh, "remove journal_head");
2002 if (jh->b_frozen_data) {
2003 printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__);
2004 jbd_free(jh->b_frozen_data, bh->b_size);
2005 }
2006 if (jh->b_committed_data) {
2007 printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__);
2008 jbd_free(jh->b_committed_data, bh->b_size);
2009 }
2010 bh->b_private = NULL;
2011 jh->b_bh = NULL; /* debug, really */
2012 clear_buffer_jbd(bh);
2013 journal_free_journal_head(jh);
2014}
2015
2016/*
2017 * Drop a reference on the passed journal_head. If it fell to zero then
2018 * release the journal_head from the buffer_head.
2019 */
2020void journal_put_journal_head(struct journal_head *jh)
2021{
2022 struct buffer_head *bh = jh2bh(jh);
2023
2024 jbd_lock_bh_journal_head(bh);
2025 J_ASSERT_JH(jh, jh->b_jcount > 0);
2026 --jh->b_jcount;
2027 if (!jh->b_jcount) {
2028 __journal_remove_journal_head(bh);
2029 jbd_unlock_bh_journal_head(bh);
2030 __brelse(bh);
2031 } else
2032 jbd_unlock_bh_journal_head(bh);
2033}
2034
2035/*
2036 * debugfs tunables
2037 */
2038#ifdef CONFIG_JBD_DEBUG
2039
2040u8 journal_enable_debug __read_mostly;
2041EXPORT_SYMBOL(journal_enable_debug);
2042
2043static struct dentry *jbd_debugfs_dir;
2044static struct dentry *jbd_debug;
2045
2046static void __init jbd_create_debugfs_entry(void)
2047{
2048 jbd_debugfs_dir = debugfs_create_dir("jbd", NULL);
2049 if (jbd_debugfs_dir)
2050 jbd_debug = debugfs_create_u8("jbd-debug", S_IRUGO | S_IWUSR,
2051 jbd_debugfs_dir,
2052 &journal_enable_debug);
2053}
2054
2055static void __exit jbd_remove_debugfs_entry(void)
2056{
2057 debugfs_remove(jbd_debug);
2058 debugfs_remove(jbd_debugfs_dir);
2059}
2060
2061#else
2062
2063static inline void jbd_create_debugfs_entry(void)
2064{
2065}
2066
2067static inline void jbd_remove_debugfs_entry(void)
2068{
2069}
2070
2071#endif
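When CONFIG_JBD_DEBUG is enabled and debugfs is mounted in its usual place (typically /sys/kernel/debug), the knob created above appears as /sys/kernel/debug/jbd/jbd-debug; writing a small integer there sets journal_enable_debug, the level against which jbd_debug() compares its verbosity argument.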
2072
2073struct kmem_cache *jbd_handle_cache;
2074
2075static int __init journal_init_handle_cache(void)
2076{
2077 jbd_handle_cache = kmem_cache_create("journal_handle",
2078 sizeof(handle_t),
2079 0, /* offset */
2080 SLAB_TEMPORARY, /* flags */
2081 NULL); /* ctor */
2082 if (jbd_handle_cache == NULL) {
2083 printk(KERN_EMERG "JBD: failed to create handle cache\n");
2084 return -ENOMEM;
2085 }
2086 return 0;
2087}
2088
2089static void journal_destroy_handle_cache(void)
2090{
2091 if (jbd_handle_cache)
2092 kmem_cache_destroy(jbd_handle_cache);
2093}
2094
2095/*
2096 * Module startup and shutdown
2097 */
2098
2099static int __init journal_init_caches(void)
2100{
2101 int ret;
2102
2103 ret = journal_init_revoke_caches();
2104 if (ret == 0)
2105 ret = journal_init_journal_head_cache();
2106 if (ret == 0)
2107 ret = journal_init_handle_cache();
2108 return ret;
2109}
2110
2111static void journal_destroy_caches(void)
2112{
2113 journal_destroy_revoke_caches();
2114 journal_destroy_journal_head_cache();
2115 journal_destroy_handle_cache();
2116}
2117
2118static int __init journal_init(void)
2119{
2120 int ret;
2121
2122 BUILD_BUG_ON(sizeof(struct journal_superblock_s) != 1024);
2123
2124 ret = journal_init_caches();
2125 if (ret != 0)
2126 journal_destroy_caches();
2127 jbd_create_debugfs_entry();
2128 return ret;
2129}
2130
2131static void __exit journal_exit(void)
2132{
2133#ifdef CONFIG_JBD_DEBUG
2134 int n = atomic_read(&nr_journal_heads);
2135 if (n)
2136 printk(KERN_ERR "JBD: leaked %d journal_heads!\n", n);
2137#endif
2138 jbd_remove_debugfs_entry();
2139 journal_destroy_caches();
2140}
2141
2142MODULE_LICENSE("GPL");
2143module_init(journal_init);
2144module_exit(journal_exit);
2145
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
deleted file mode 100644
index a748fe21465a..000000000000
--- a/fs/jbd/recovery.c
+++ /dev/null
@@ -1,594 +0,0 @@
1/*
2 * linux/fs/jbd/recovery.c
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
5 *
6 * Copyright 1999-2000 Red Hat Software --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Journal recovery routines for the generic filesystem journaling code;
13 * part of the ext2fs journaling system.
14 */
15
16#ifndef __KERNEL__
17#include "jfs_user.h"
18#else
19#include <linux/time.h>
20#include <linux/fs.h>
21#include <linux/jbd.h>
22#include <linux/errno.h>
23#include <linux/blkdev.h>
24#endif
25
26/*
27 * Maintain information about the progress of the recovery job, so that
28 * the different passes can carry information between them.
29 */
30struct recovery_info
31{
32 tid_t start_transaction;
33 tid_t end_transaction;
34
35 int nr_replays;
36 int nr_revokes;
37 int nr_revoke_hits;
38};
39
40enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY};
41static int do_one_pass(journal_t *journal,
42 struct recovery_info *info, enum passtype pass);
43static int scan_revoke_records(journal_t *, struct buffer_head *,
44 tid_t, struct recovery_info *);
45
46#ifdef __KERNEL__
47
48/* Release readahead buffers after use */
49static void journal_brelse_array(struct buffer_head *b[], int n)
50{
51 while (--n >= 0)
52 brelse (b[n]);
53}
54
55
56/*
57 * When reading from the journal, we are going through the block device
58 * layer directly and so there is no readahead being done for us. We
59 * need to implement any readahead ourselves if we want it to happen at
60 * all. Recovery is basically one long sequential read, so make sure we
61 * do the IO in reasonably large chunks.
62 *
63 * This is not so critical that we need to be enormously clever about
64 * the readahead size, though. 128K is a purely arbitrary, good-enough
65 * fixed value.
66 */
67
68#define MAXBUF 8
69static int do_readahead(journal_t *journal, unsigned int start)
70{
71 int err;
72 unsigned int max, nbufs, next;
73 unsigned int blocknr;
74 struct buffer_head *bh;
75
76 struct buffer_head * bufs[MAXBUF];
77
78 /* Do up to 128K of readahead */
79 max = start + (128 * 1024 / journal->j_blocksize);
80 if (max > journal->j_maxlen)
81 max = journal->j_maxlen;
82
83 /* Do the readahead itself. We'll submit MAXBUF buffer_heads at
84 * a time to the block device IO layer. */
85
86 nbufs = 0;
87
88 for (next = start; next < max; next++) {
89 err = journal_bmap(journal, next, &blocknr);
90
91 if (err) {
92 printk (KERN_ERR "JBD: bad block at offset %u\n",
93 next);
94 goto failed;
95 }
96
97 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
98 if (!bh) {
99 err = -ENOMEM;
100 goto failed;
101 }
102
103 if (!buffer_uptodate(bh) && !buffer_locked(bh)) {
104 bufs[nbufs++] = bh;
105 if (nbufs == MAXBUF) {
106 ll_rw_block(READ, nbufs, bufs);
107 journal_brelse_array(bufs, nbufs);
108 nbufs = 0;
109 }
110 } else
111 brelse(bh);
112 }
113
114 if (nbufs)
115 ll_rw_block(READ, nbufs, bufs);
116 err = 0;
117
118failed:
119 if (nbufs)
120 journal_brelse_array(bufs, nbufs);
121 return err;
122}
123
124#endif /* __KERNEL__ */
125
126
127/*
128 * Read a block from the journal
129 */
130
131static int jread(struct buffer_head **bhp, journal_t *journal,
132 unsigned int offset)
133{
134 int err;
135 unsigned int blocknr;
136 struct buffer_head *bh;
137
138 *bhp = NULL;
139
140 if (offset >= journal->j_maxlen) {
141 printk(KERN_ERR "JBD: corrupted journal superblock\n");
142 return -EIO;
143 }
144
145 err = journal_bmap(journal, offset, &blocknr);
146
147 if (err) {
148 printk (KERN_ERR "JBD: bad block at offset %u\n",
149 offset);
150 return err;
151 }
152
153 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
154 if (!bh)
155 return -ENOMEM;
156
157 if (!buffer_uptodate(bh)) {
158 /* If this is a brand new buffer, start readahead.
159 Otherwise, we assume we are already reading it. */
160 if (!buffer_req(bh))
161 do_readahead(journal, offset);
162 wait_on_buffer(bh);
163 }
164
165 if (!buffer_uptodate(bh)) {
166 printk (KERN_ERR "JBD: Failed to read block at offset %u\n",
167 offset);
168 brelse(bh);
169 return -EIO;
170 }
171
172 *bhp = bh;
173 return 0;
174}
175
176
177/*
178 * Count the number of in-use tags in a journal descriptor block.
179 */
180
181static int count_tags(struct buffer_head *bh, int size)
182{
183 char * tagp;
184 journal_block_tag_t * tag;
185 int nr = 0;
186
187 tagp = &bh->b_data[sizeof(journal_header_t)];
188
189 while ((tagp - bh->b_data + sizeof(journal_block_tag_t)) <= size) {
190 tag = (journal_block_tag_t *) tagp;
191
192 nr++;
193 tagp += sizeof(journal_block_tag_t);
194 if (!(tag->t_flags & cpu_to_be32(JFS_FLAG_SAME_UUID)))
195 tagp += 16;
196
197 if (tag->t_flags & cpu_to_be32(JFS_FLAG_LAST_TAG))
198 break;
199 }
200
201 return nr;
202}
203
204
205/* Make sure we wrap around the log correctly! */
206#define wrap(journal, var) \
207do { \
208 if (var >= (journal)->j_last) \
209 var -= ((journal)->j_last - (journal)->j_first); \
210} while (0)
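For example, with j_first == 1 and j_last == 1025 (a 1024-block log), a next_log_block that advances to 1025 is wrapped back to 1, so the traversal below circles the log instead of running off its end.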
211
212/**
213 * journal_recover - recovers an on-disk journal
214 * @journal: the journal to recover
215 *
216 * The primary function for recovering the log contents when mounting a
217 * journaled device.
218 *
219 * Recovery is done in three passes. In the first pass, we look for the
220 * end of the log. In the second, we assemble the list of revoke
221 * blocks. In the third and final pass, we replay any un-revoked blocks
222 * in the log.
223 */
224int journal_recover(journal_t *journal)
225{
226 int err, err2;
227 journal_superblock_t * sb;
228
229 struct recovery_info info;
230
231 memset(&info, 0, sizeof(info));
232 sb = journal->j_superblock;
233
234 /*
235 * The journal superblock's s_start field (the current log head)
236 * is always zero if, and only if, the journal was cleanly
237 * unmounted.
238 */
239
240 if (!sb->s_start) {
241 jbd_debug(1, "No recovery required, last transaction %d\n",
242 be32_to_cpu(sb->s_sequence));
243 journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1;
244 return 0;
245 }
246
247 err = do_one_pass(journal, &info, PASS_SCAN);
248 if (!err)
249 err = do_one_pass(journal, &info, PASS_REVOKE);
250 if (!err)
251 err = do_one_pass(journal, &info, PASS_REPLAY);
252
253 jbd_debug(1, "JBD: recovery, exit status %d, "
254 "recovered transactions %u to %u\n",
255 err, info.start_transaction, info.end_transaction);
256 jbd_debug(1, "JBD: Replayed %d and revoked %d/%d blocks\n",
257 info.nr_replays, info.nr_revoke_hits, info.nr_revokes);
258
259 /* Restart the log at the next transaction ID, thus invalidating
260 * any existing commit records in the log. */
261 journal->j_transaction_sequence = ++info.end_transaction;
262
263 journal_clear_revoke(journal);
264 err2 = sync_blockdev(journal->j_fs_dev);
265 if (!err)
266 err = err2;
267 /* Flush disk caches to get replayed data on the permanent storage */
268 if (journal->j_flags & JFS_BARRIER) {
269 err2 = blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
270 if (!err)
271 err = err2;
272 }
273
274 return err;
275}
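As a concrete illustration of the sequence restart: if the scan pass finds complete commits for transactions 42 through 45 and then reaches the end of the valid log, info.end_transaction is left at 46, so the journal resumes issuing IDs at 47; a partially written transaction 46 still sitting in the log can then never match a future commit sequence number.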
276
277/**
278 * journal_skip_recovery - Start journal and wipe existing records
279 * @journal: journal to startup
280 *
281 * Locate any valid recovery information from the journal and set up the
282 * journal structures in memory to ignore it (presumably because the
283 * caller has evidence that it is out of date).
284 * This function doesn't appear to be exported.
285 *
286 * We perform one pass over the journal to allow us to tell the user how
287 * much recovery information is being erased, and to let us initialise
288 * the journal transaction sequence numbers to the next unused ID.
289 */
290int journal_skip_recovery(journal_t *journal)
291{
292 int err;
293 struct recovery_info info;
294
295 memset (&info, 0, sizeof(info));
296
297 err = do_one_pass(journal, &info, PASS_SCAN);
298
299 if (err) {
300 printk(KERN_ERR "JBD: error %d scanning journal\n", err);
301 ++journal->j_transaction_sequence;
302 } else {
303#ifdef CONFIG_JBD_DEBUG
304 int dropped = info.end_transaction -
305 be32_to_cpu(journal->j_superblock->s_sequence);
306 jbd_debug(1,
307 "JBD: ignoring %d transaction%s from the journal.\n",
308 dropped, (dropped == 1) ? "" : "s");
309#endif
310 journal->j_transaction_sequence = ++info.end_transaction;
311 }
312
313 journal->j_tail = 0;
314 return err;
315}
316
317static int do_one_pass(journal_t *journal,
318 struct recovery_info *info, enum passtype pass)
319{
320 unsigned int first_commit_ID, next_commit_ID;
321 unsigned int next_log_block;
322 int err, success = 0;
323 journal_superblock_t * sb;
324 journal_header_t * tmp;
325 struct buffer_head * bh;
326 unsigned int sequence;
327 int blocktype;
328
329 /*
330 * First thing is to establish what we expect to find in the log
331 * (in terms of transaction IDs), and where (in terms of log
332 * block offsets): query the superblock.
333 */
334
335 sb = journal->j_superblock;
336 next_commit_ID = be32_to_cpu(sb->s_sequence);
337 next_log_block = be32_to_cpu(sb->s_start);
338
339 first_commit_ID = next_commit_ID;
340 if (pass == PASS_SCAN)
341 info->start_transaction = first_commit_ID;
342
343 jbd_debug(1, "Starting recovery pass %d\n", pass);
344
345 /*
346 * Now we walk through the log, transaction by transaction,
347 * making sure that each transaction has a commit block in the
348 * expected place. Each complete transaction gets replayed back
349 * into the main filesystem.
350 */
351
352 while (1) {
353 int flags;
354 char * tagp;
355 journal_block_tag_t * tag;
356 struct buffer_head * obh;
357 struct buffer_head * nbh;
358
359 cond_resched();
360
361 /* If we already know where to stop the log traversal,
362 * check right now that we haven't gone past the end of
363 * the log. */
364
365 if (pass != PASS_SCAN)
366 if (tid_geq(next_commit_ID, info->end_transaction))
367 break;
368
369 jbd_debug(2, "Scanning for sequence ID %u at %u/%u\n",
370 next_commit_ID, next_log_block, journal->j_last);
371
372 /* Skip over each chunk of the transaction, looking for
373 * either the next descriptor block or the final commit
374 * record. */
375
376 jbd_debug(3, "JBD: checking block %u\n", next_log_block);
377 err = jread(&bh, journal, next_log_block);
378 if (err)
379 goto failed;
380
381 next_log_block++;
382 wrap(journal, next_log_block);
383
384 /* What kind of buffer is it?
385 *
386 * If it is a descriptor block, check that it has the
387 * expected sequence number. Otherwise, we're all done
388 * here. */
389
390 tmp = (journal_header_t *)bh->b_data;
391
392 if (tmp->h_magic != cpu_to_be32(JFS_MAGIC_NUMBER)) {
393 brelse(bh);
394 break;
395 }
396
397 blocktype = be32_to_cpu(tmp->h_blocktype);
398 sequence = be32_to_cpu(tmp->h_sequence);
399 jbd_debug(3, "Found magic %d, sequence %d\n",
400 blocktype, sequence);
401
402 if (sequence != next_commit_ID) {
403 brelse(bh);
404 break;
405 }
406
407 /* OK, we have a valid descriptor block which matches
408 * all of the sequence number checks. What are we going
409 * to do with it? That depends on the pass... */
410
411 switch(blocktype) {
412 case JFS_DESCRIPTOR_BLOCK:
413 /* If it is a valid descriptor block, replay it
414 * in pass REPLAY; otherwise, just skip over the
415 * blocks it describes. */
416 if (pass != PASS_REPLAY) {
417 next_log_block +=
418 count_tags(bh, journal->j_blocksize);
419 wrap(journal, next_log_block);
420 brelse(bh);
421 continue;
422 }
423
424 /* A descriptor block: we can now write all of
425 * the data blocks. Yay, useful work is finally
426 * getting done here! */
427
428 tagp = &bh->b_data[sizeof(journal_header_t)];
429 while ((tagp - bh->b_data +sizeof(journal_block_tag_t))
430 <= journal->j_blocksize) {
431 unsigned int io_block;
432
433 tag = (journal_block_tag_t *) tagp;
434 flags = be32_to_cpu(tag->t_flags);
435
436 io_block = next_log_block++;
437 wrap(journal, next_log_block);
438 err = jread(&obh, journal, io_block);
439 if (err) {
440 /* Recover what we can, but
441 * report failure at the end. */
442 success = err;
443 printk (KERN_ERR
444 "JBD: IO error %d recovering "
445 "block %u in log\n",
446 err, io_block);
447 } else {
448 unsigned int blocknr;
449
450 J_ASSERT(obh != NULL);
451 blocknr = be32_to_cpu(tag->t_blocknr);
452
453 /* If the block has been
454 * revoked, then we're all done
455 * here. */
456 if (journal_test_revoke
457 (journal, blocknr,
458 next_commit_ID)) {
459 brelse(obh);
460 ++info->nr_revoke_hits;
461 goto skip_write;
462 }
463
464 /* Find a buffer for the new
465 * data being restored */
466 nbh = __getblk(journal->j_fs_dev,
467 blocknr,
468 journal->j_blocksize);
469 if (nbh == NULL) {
470 printk(KERN_ERR
471 "JBD: Out of memory "
472 "during recovery.\n");
473 err = -ENOMEM;
474 brelse(bh);
475 brelse(obh);
476 goto failed;
477 }
478
479 lock_buffer(nbh);
480 memcpy(nbh->b_data, obh->b_data,
481 journal->j_blocksize);
482 if (flags & JFS_FLAG_ESCAPE) {
483 *((__be32 *)nbh->b_data) =
484 cpu_to_be32(JFS_MAGIC_NUMBER);
485 }
486
487 BUFFER_TRACE(nbh, "marking dirty");
488 set_buffer_uptodate(nbh);
489 mark_buffer_dirty(nbh);
490 BUFFER_TRACE(nbh, "marking uptodate");
491 ++info->nr_replays;
492 /* ll_rw_block(WRITE, 1, &nbh); */
493 unlock_buffer(nbh);
494 brelse(obh);
495 brelse(nbh);
496 }
497
498 skip_write:
499 tagp += sizeof(journal_block_tag_t);
500 if (!(flags & JFS_FLAG_SAME_UUID))
501 tagp += 16;
502
503 if (flags & JFS_FLAG_LAST_TAG)
504 break;
505 }
506
507 brelse(bh);
508 continue;
509
510 case JFS_COMMIT_BLOCK:
511 /* Found an expected commit block: not much to
512 * do other than move on to the next sequence
513 * number. */
514 brelse(bh);
515 next_commit_ID++;
516 continue;
517
518 case JFS_REVOKE_BLOCK:
519 /* If we aren't in the REVOKE pass, then we can
520 * just skip over this block. */
521 if (pass != PASS_REVOKE) {
522 brelse(bh);
523 continue;
524 }
525
526 err = scan_revoke_records(journal, bh,
527 next_commit_ID, info);
528 brelse(bh);
529 if (err)
530 goto failed;
531 continue;
532
533 default:
534 jbd_debug(3, "Unrecognised magic %d, end of scan.\n",
535 blocktype);
536 brelse(bh);
537 goto done;
538 }
539 }
540
541 done:
542 /*
543 * We broke out of the log scan loop: either we came to the
544 * known end of the log or we found an unexpected block in the
545 * log. If the latter happened, then we know that the "current"
546 * transaction marks the end of the valid log.
547 */
548
549 if (pass == PASS_SCAN)
550 info->end_transaction = next_commit_ID;
551 else {
552 /* It's really bad news if different passes end up at
553 * different places (but possible due to IO errors). */
554 if (info->end_transaction != next_commit_ID) {
555 printk (KERN_ERR "JBD: recovery pass %d ended at "
556 "transaction %u, expected %u\n",
557 pass, next_commit_ID, info->end_transaction);
558 if (!success)
559 success = -EIO;
560 }
561 }
562
563 return success;
564
565 failed:
566 return err;
567}
568
569
570/* Scan a revoke record, marking all blocks mentioned as revoked. */
571
572static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
573 tid_t sequence, struct recovery_info *info)
574{
575 journal_revoke_header_t *header;
576 int offset, max;
577
578 header = (journal_revoke_header_t *) bh->b_data;
579 offset = sizeof(journal_revoke_header_t);
580 max = be32_to_cpu(header->r_count);
581
582 while (offset < max) {
583 unsigned int blocknr;
584 int err;
585
586 blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset)));
587 offset += 4;
588 err = journal_set_revoke(journal, blocknr, sequence);
589 if (err)
590 return err;
591 ++info->nr_revokes;
592 }
593 return 0;
594}
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
deleted file mode 100644
index dcead636c33b..000000000000
--- a/fs/jbd/revoke.c
+++ /dev/null
@@ -1,733 +0,0 @@
1/*
2 * linux/fs/jbd/revoke.c
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 2000
5 *
6 * Copyright 2000 Red Hat corp --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Journal revoke routines for the generic filesystem journaling code;
13 * part of the ext2fs journaling system.
14 *
15 * Revoke is the mechanism used to prevent old log records for deleted
16 * metadata from being replayed on top of newer data using the same
17 * blocks. The revoke mechanism is used in two separate places:
18 *
19 * + Commit: during commit we write the entire list of the current
20 * transaction's revoked blocks to the journal
21 *
22 * + Recovery: during recovery we record the transaction ID of all
23 * revoked blocks. If there are multiple revoke records in the log
24 * for a single block, only the last one counts, and if there is a log
25 * entry for a block beyond the last revoke, then that log entry still
26 * gets replayed.
27 *
28 * We can get interactions between revokes and new log data within a
29 * single transaction:
30 *
31 * Block is revoked and then journaled:
32 * The desired end result is the journaling of the new block, so we
33 * cancel the revoke before the transaction commits.
34 *
35 * Block is journaled and then revoked:
36 * The revoke must take precedence over the write of the block, so we
37 * need either to cancel the journal entry or to write the revoke
38 * later in the log than the log block. In this case, we choose the
39 * latter: journaling a block cancels any revoke record for that block
40 * in the current transaction, so any revoke for that block in the
41 * transaction must have happened after the block was journaled and so
42 * the revoke must take precedence.
43 *
44 * Block is revoked and then written as data:
45 * The data write is allowed to succeed, but the revoke is _not_
46 * cancelled. We still need to prevent old log records from
47 * overwriting the new data. We don't even need to clear the revoke
48 * bit here.
49 *
50 * We cache the revoke status of a buffer in the current transaction in b_state
51 * bits. As the name says, revokevalid flag indicates that the cached revoke
52 * status of a buffer is valid and we can rely on the cached status.
53 *
54 * Revoke information on buffers is a tri-state value:
55 *
56 * RevokeValid clear: no cached revoke status, need to look it up
57 * RevokeValid set, Revoked clear:
58 * buffer has not been revoked, and cancel_revoke
59 * need do nothing.
60 * RevokeValid set, Revoked set:
61 * buffer has been revoked.
62 *
63 * Locking rules:
64 * We keep two hash tables of revoke records. One hashtable belongs to the
65 * running transaction (is pointed to by journal->j_revoke), the other one
66 * belongs to the committing transaction. Accesses to the second hash table
67 * happen only from kjournald and no other thread touches this table. Also
68 * journal_switch_revoke_table() which switches which hashtable belongs to the
69 * running and which to the committing transaction is called only from
70 * kjournald. Therefore we need no locks when accessing the hashtable belonging
71 * to the committing transaction.
72 *
73 * All users operating on the hash table belonging to the running transaction
74 * have a handle to the transaction. Therefore they are safe from kjournald
75 * switching hash tables under them. For operations on the lists of entries in
76 * the hash table j_revoke_lock is used.
77 *
78 * Finally, the replay code also uses the hash tables, but at that point no
79 * one else can touch them (the filesystem isn't mounted yet) and hence no
80 * locking is needed.
81 */
82
83#ifndef __KERNEL__
84#include "jfs_user.h"
85#else
86#include <linux/time.h>
87#include <linux/fs.h>
88#include <linux/jbd.h>
89#include <linux/errno.h>
90#include <linux/slab.h>
91#include <linux/list.h>
92#include <linux/init.h>
93#include <linux/bio.h>
94#endif
95#include <linux/log2.h>
96#include <linux/hash.h>
97
98static struct kmem_cache *revoke_record_cache;
99static struct kmem_cache *revoke_table_cache;
100
101/* Each revoke record represents one single revoked block. During
102 journal replay, this involves recording the transaction ID of the
103 last transaction to revoke this block. */
104
105struct jbd_revoke_record_s
106{
107 struct list_head hash;
108 tid_t sequence; /* Used for recovery only */
109 unsigned int blocknr;
110};
111
112
113/* The revoke table is just a simple hash table of revoke records. */
114struct jbd_revoke_table_s
115{
116 /* It is conceivable that we might want a larger hash table
117 * for recovery. Must be a power of two. */
118 int hash_size;
119 int hash_shift;
120 struct list_head *hash_table;
121};
122
123
124#ifdef __KERNEL__
125static void write_one_revoke_record(journal_t *, transaction_t *,
126 struct journal_head **, int *,
127 struct jbd_revoke_record_s *, int);
128static void flush_descriptor(journal_t *, struct journal_head *, int, int);
129#endif
130
131/* Utility functions to maintain the revoke table */
132
133static inline int hash(journal_t *journal, unsigned int block)
134{
135 struct jbd_revoke_table_s *table = journal->j_revoke;
136
137 return hash_32(block, table->hash_shift);
138}
139
140static int insert_revoke_hash(journal_t *journal, unsigned int blocknr,
141 tid_t seq)
142{
143 struct list_head *hash_list;
144 struct jbd_revoke_record_s *record;
145
146repeat:
147 record = kmem_cache_alloc(revoke_record_cache, GFP_NOFS);
148 if (!record)
149 goto oom;
150
151 record->sequence = seq;
152 record->blocknr = blocknr;
153 hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)];
154 spin_lock(&journal->j_revoke_lock);
155 list_add(&record->hash, hash_list);
156 spin_unlock(&journal->j_revoke_lock);
157 return 0;
158
159oom:
160 if (!journal_oom_retry)
161 return -ENOMEM;
162 jbd_debug(1, "ENOMEM in %s, retrying\n", __func__);
163 yield();
164 goto repeat;
165}
166
167/* Find a revoke record in the journal's hash table. */
168
169static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal,
170 unsigned int blocknr)
171{
172 struct list_head *hash_list;
173 struct jbd_revoke_record_s *record;
174
175 hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)];
176
177 spin_lock(&journal->j_revoke_lock);
178 record = (struct jbd_revoke_record_s *) hash_list->next;
179 while (&(record->hash) != hash_list) {
180 if (record->blocknr == blocknr) {
181 spin_unlock(&journal->j_revoke_lock);
182 return record;
183 }
184 record = (struct jbd_revoke_record_s *) record->hash.next;
185 }
186 spin_unlock(&journal->j_revoke_lock);
187 return NULL;
188}
189
190void journal_destroy_revoke_caches(void)
191{
192 if (revoke_record_cache) {
193 kmem_cache_destroy(revoke_record_cache);
194 revoke_record_cache = NULL;
195 }
196 if (revoke_table_cache) {
197 kmem_cache_destroy(revoke_table_cache);
198 revoke_table_cache = NULL;
199 }
200}
201
202int __init journal_init_revoke_caches(void)
203{
204 J_ASSERT(!revoke_record_cache);
205 J_ASSERT(!revoke_table_cache);
206
207 revoke_record_cache = kmem_cache_create("revoke_record",
208 sizeof(struct jbd_revoke_record_s),
209 0,
210 SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY,
211 NULL);
212 if (!revoke_record_cache)
213 goto record_cache_failure;
214
215 revoke_table_cache = kmem_cache_create("revoke_table",
216 sizeof(struct jbd_revoke_table_s),
217 0, SLAB_TEMPORARY, NULL);
218 if (!revoke_table_cache)
219 goto table_cache_failure;
220
221 return 0;
222
223table_cache_failure:
224 journal_destroy_revoke_caches();
225record_cache_failure:
226 return -ENOMEM;
227}
228
229static struct jbd_revoke_table_s *journal_init_revoke_table(int hash_size)
230{
231 int i;
232 struct jbd_revoke_table_s *table;
233
234 table = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL);
235 if (!table)
236 goto out;
237
238 table->hash_size = hash_size;
239 table->hash_shift = ilog2(hash_size);
240 table->hash_table =
241 kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL);
242 if (!table->hash_table) {
243 kmem_cache_free(revoke_table_cache, table);
244 table = NULL;
245 goto out;
246 }
247
248 for (i = 0; i < hash_size; i++)
249 INIT_LIST_HEAD(&table->hash_table[i]);
250
251out:
252 return table;
253}
254
255static void journal_destroy_revoke_table(struct jbd_revoke_table_s *table)
256{
257 int i;
258 struct list_head *hash_list;
259
260 for (i = 0; i < table->hash_size; i++) {
261 hash_list = &table->hash_table[i];
262 J_ASSERT(list_empty(hash_list));
263 }
264
265 kfree(table->hash_table);
266 kmem_cache_free(revoke_table_cache, table);
267}
268
269/* Initialise the revoke table for a given journal to a given size. */
270int journal_init_revoke(journal_t *journal, int hash_size)
271{
272 J_ASSERT(journal->j_revoke_table[0] == NULL);
273 J_ASSERT(is_power_of_2(hash_size));
274
275 journal->j_revoke_table[0] = journal_init_revoke_table(hash_size);
276 if (!journal->j_revoke_table[0])
277 goto fail0;
278
279 journal->j_revoke_table[1] = journal_init_revoke_table(hash_size);
280 if (!journal->j_revoke_table[1])
281 goto fail1;
282
283 journal->j_revoke = journal->j_revoke_table[1];
284
285 spin_lock_init(&journal->j_revoke_lock);
286
287 return 0;
288
289fail1:
290 journal_destroy_revoke_table(journal->j_revoke_table[0]);
291fail0:
292 return -ENOMEM;
293}
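A hedged sketch of a caller; the table size is illustrative and only needs to satisfy the power-of-two assertion above:

	err = journal_init_revoke(journal, 256);	/* two 256-bucket tables */
	if (err)
		return err;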
294
295/* Destroy a journal's revoke table. The table must already be empty! */
296void journal_destroy_revoke(journal_t *journal)
297{
298 journal->j_revoke = NULL;
299 if (journal->j_revoke_table[0])
300 journal_destroy_revoke_table(journal->j_revoke_table[0]);
301 if (journal->j_revoke_table[1])
302 journal_destroy_revoke_table(journal->j_revoke_table[1]);
303}
304
305
306#ifdef __KERNEL__
307
308/*
309 * journal_revoke: revoke a given buffer_head from the journal. This
310 * prevents the block from being replayed during recovery if we take a
311 * crash after this current transaction commits. Any subsequent
312 * metadata writes of the buffer in this transaction cancel the
313 * revoke.
314 *
315 * Note that this call may block --- it is up to the caller to make
316 * sure that there are no further calls to journal_write_metadata
317 * before the revoke is complete. In ext3, this implies calling the
318 * revoke before clearing the block bitmap when we are deleting
319 * metadata.
320 *
321 * Revoke performs a journal_forget on any buffer_head passed in as a
322 * parameter, but does _not_ forget the buffer_head if the bh was only
323 * found implicitly.
324 *
325 * bh_in may not be a journalled buffer - it may have come off
326 * the hash tables without an attached journal_head.
327 *
328 * If bh_in is non-zero, journal_revoke() will decrement its b_count
329 * by one.
330 */
331
332int journal_revoke(handle_t *handle, unsigned int blocknr,
333 struct buffer_head *bh_in)
334{
335 struct buffer_head *bh = NULL;
336 journal_t *journal;
337 struct block_device *bdev;
338 int err;
339
340 might_sleep();
341 if (bh_in)
342 BUFFER_TRACE(bh_in, "enter");
343
344 journal = handle->h_transaction->t_journal;
345 if (!journal_set_features(journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)){
346 J_ASSERT (!"Cannot set revoke feature!");
347 return -EINVAL;
348 }
349
350 bdev = journal->j_fs_dev;
351 bh = bh_in;
352
353 if (!bh) {
354 bh = __find_get_block(bdev, blocknr, journal->j_blocksize);
355 if (bh)
356 BUFFER_TRACE(bh, "found on hash");
357 }
358#ifdef JBD_EXPENSIVE_CHECKING
359 else {
360 struct buffer_head *bh2;
361
362 /* If there is a different buffer_head lying around in
363 * memory anywhere... */
364 bh2 = __find_get_block(bdev, blocknr, journal->j_blocksize);
365 if (bh2) {
366 /* ... and it has RevokeValid status... */
367 if (bh2 != bh && buffer_revokevalid(bh2))
368 /* ...then it better be revoked too,
369 * since it's illegal to create a revoke
370 * record against a buffer_head which is
371 * not marked revoked --- that would
372 * risk missing a subsequent revoke
373 * cancel. */
374 J_ASSERT_BH(bh2, buffer_revoked(bh2));
375 put_bh(bh2);
376 }
377 }
378#endif
379
380 /* We really ought not ever to revoke twice in a row without
381 first having the revoke cancelled: it's illegal to free a
382 block twice without allocating it in between! */
383 if (bh) {
384 if (!J_EXPECT_BH(bh, !buffer_revoked(bh),
385 "inconsistent data on disk")) {
386 if (!bh_in)
387 brelse(bh);
388 return -EIO;
389 }
390 set_buffer_revoked(bh);
391 set_buffer_revokevalid(bh);
392 if (bh_in) {
393 BUFFER_TRACE(bh_in, "call journal_forget");
394 journal_forget(handle, bh_in);
395 } else {
396 BUFFER_TRACE(bh, "call brelse");
397 __brelse(bh);
398 }
399 }
400
401 jbd_debug(2, "insert revoke for block %u, bh_in=%p\n", blocknr, bh_in);
402 err = insert_revoke_hash(journal, blocknr,
403 handle->h_transaction->t_tid);
404 BUFFER_TRACE(bh_in, "exit");
405 return err;
406}
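A hedged caller-side sketch of the deletion path described above, following the rule of revoking before clearing the block bitmap; the surrounding code is illustrative:

	/* Revoke the freed metadata block before clearing its bitmap bit,
	 * so a crash after this transaction commits cannot replay stale
	 * contents over reused data. */
	err = journal_revoke(handle, bh->b_blocknr, bh);
	if (err)
		return err;
	/* ...only now clear the block in the allocation bitmap, under the
	 * same handle. */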
407
408/*
409 * Cancel an outstanding revoke. For use only internally by the
410 * journaling code (called from journal_get_write_access).
411 *
412 * We trust buffer_revoked() on the buffer if the buffer is already
413 * being journaled: if there is no revoke pending on the buffer, then we
414 * don't do anything here.
415 *
416 * This would break if it were possible for a buffer to be revoked and
417 * discarded, and then reallocated within the same transaction. In such
418 * a case we would have lost the revoked bit, but when we arrived here
419 * the second time we would still have a pending revoke to cancel. So,
420 * do not trust the Revoked bit on buffers unless RevokeValid is also
421 * set.
422 */
423int journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
424{
425 struct jbd_revoke_record_s *record;
426 journal_t *journal = handle->h_transaction->t_journal;
427 int need_cancel;
428 int did_revoke = 0; /* akpm: debug */
429 struct buffer_head *bh = jh2bh(jh);
430
431 jbd_debug(4, "journal_head %p, cancelling revoke\n", jh);
432
433 /* Is the existing Revoke bit valid? If so, we trust it, and
434 * only perform the full cancel if the revoke bit is set. If
435 * not, we can't trust the revoke bit, and we need to do the
436 * full search for a revoke record. */
437 if (test_set_buffer_revokevalid(bh)) {
438 need_cancel = test_clear_buffer_revoked(bh);
439 } else {
440 need_cancel = 1;
441 clear_buffer_revoked(bh);
442 }
443
444 if (need_cancel) {
445 record = find_revoke_record(journal, bh->b_blocknr);
446 if (record) {
447 jbd_debug(4, "cancelled existing revoke on "
448 "blocknr %llu\n", (unsigned long long)bh->b_blocknr);
449 spin_lock(&journal->j_revoke_lock);
450 list_del(&record->hash);
451 spin_unlock(&journal->j_revoke_lock);
452 kmem_cache_free(revoke_record_cache, record);
453 did_revoke = 1;
454 }
455 }
456
457#ifdef JBD_EXPENSIVE_CHECKING
458 /* There better not be one left behind by now! */
459 record = find_revoke_record(journal, bh->b_blocknr);
460 J_ASSERT_JH(jh, record == NULL);
461#endif
462
463 /* Finally, have we just cleared revoke on an unhashed
464 * buffer_head? If so, we'd better make sure we clear the
465 * revoked status on any hashed alias too, otherwise the revoke
466 * state machine will get very upset later on. */
467 if (need_cancel) {
468 struct buffer_head *bh2;
469 bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr, bh->b_size);
470 if (bh2) {
471 if (bh2 != bh)
472 clear_buffer_revoked(bh2);
473 __brelse(bh2);
474 }
475 }
476 return did_revoke;
477}
478
479/*
480 * journal_clear_buffer_revoked_flags clears the revoked flag of buffers
481 * in the revoke table, to reflect that there are no revoked buffers in
482 * the next transaction which is going to be started.
483 */
484void journal_clear_buffer_revoked_flags(journal_t *journal)
485{
486 struct jbd_revoke_table_s *revoke = journal->j_revoke;
487 int i = 0;
488
489 for (i = 0; i < revoke->hash_size; i++) {
490 struct list_head *hash_list;
491 struct list_head *list_entry;
492 hash_list = &revoke->hash_table[i];
493
494 list_for_each(list_entry, hash_list) {
495 struct jbd_revoke_record_s *record;
496 struct buffer_head *bh;
497 record = (struct jbd_revoke_record_s *)list_entry;
498 bh = __find_get_block(journal->j_fs_dev,
499 record->blocknr,
500 journal->j_blocksize);
501 if (bh) {
502 clear_buffer_revoked(bh);
503 __brelse(bh);
504 }
505 }
506 }
507}
508
509 /* journal_switch_revoke_table: select j_revoke for the next transaction;
510 * we do not want to suspend any processing until all revokes are
511 * written -bzzz
512 */
513void journal_switch_revoke_table(journal_t *journal)
514{
515 int i;
516
517 if (journal->j_revoke == journal->j_revoke_table[0])
518 journal->j_revoke = journal->j_revoke_table[1];
519 else
520 journal->j_revoke = journal->j_revoke_table[0];
521
522 for (i = 0; i < journal->j_revoke->hash_size; i++)
523 INIT_LIST_HEAD(&journal->j_revoke->hash_table[i]);
524}
525
526/*
527 * Write revoke records to the journal for all entries in the current
528 * revoke hash, deleting the entries as we go.
529 */
530void journal_write_revoke_records(journal_t *journal,
531 transaction_t *transaction, int write_op)
532{
533 struct journal_head *descriptor;
534 struct jbd_revoke_record_s *record;
535 struct jbd_revoke_table_s *revoke;
536 struct list_head *hash_list;
537 int i, offset, count;
538
539 descriptor = NULL;
540 offset = 0;
541 count = 0;
542
543 /* select revoke table for committing transaction */
544 revoke = journal->j_revoke == journal->j_revoke_table[0] ?
545 journal->j_revoke_table[1] : journal->j_revoke_table[0];
546
547 for (i = 0; i < revoke->hash_size; i++) {
548 hash_list = &revoke->hash_table[i];
549
550 while (!list_empty(hash_list)) {
551 record = (struct jbd_revoke_record_s *)
552 hash_list->next;
553 write_one_revoke_record(journal, transaction,
554 &descriptor, &offset,
555 record, write_op);
556 count++;
557 list_del(&record->hash);
558 kmem_cache_free(revoke_record_cache, record);
559 }
560 }
561 if (descriptor)
562 flush_descriptor(journal, descriptor, offset, write_op);
563 jbd_debug(1, "Wrote %d revoke records\n", count);
564}
565
566/*
567 * Write out one revoke record. We need to create a new descriptor
568 * block if the old one is full or if we have not already created one.
569 */
570
571static void write_one_revoke_record(journal_t *journal,
572 transaction_t *transaction,
573 struct journal_head **descriptorp,
574 int *offsetp,
575 struct jbd_revoke_record_s *record,
576 int write_op)
577{
578 struct journal_head *descriptor;
579 int offset;
580 journal_header_t *header;
581
582 /* If we are already aborting, this all becomes a noop. We
583 still need to go round the loop in
584 journal_write_revoke_records in order to free all of the
585 revoke records: only the IO to the journal is omitted. */
586 if (is_journal_aborted(journal))
587 return;
588
589 descriptor = *descriptorp;
590 offset = *offsetp;
591
592 /* Make sure we have a descriptor with space left for the record */
593 if (descriptor) {
594 if (offset == journal->j_blocksize) {
595 flush_descriptor(journal, descriptor, offset, write_op);
596 descriptor = NULL;
597 }
598 }
599
600 if (!descriptor) {
601 descriptor = journal_get_descriptor_buffer(journal);
602 if (!descriptor)
603 return;
604 header = (journal_header_t *) &jh2bh(descriptor)->b_data[0];
605 header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
606 header->h_blocktype = cpu_to_be32(JFS_REVOKE_BLOCK);
607 header->h_sequence = cpu_to_be32(transaction->t_tid);
608
609 /* Record it so that we can wait for IO completion later */
610 JBUFFER_TRACE(descriptor, "file as BJ_LogCtl");
611 journal_file_buffer(descriptor, transaction, BJ_LogCtl);
612
613 offset = sizeof(journal_revoke_header_t);
614 *descriptorp = descriptor;
615 }
616
617 * ((__be32 *)(&jh2bh(descriptor)->b_data[offset])) =
618 cpu_to_be32(record->blocknr);
619 offset += 4;
620 *offsetp = offset;
621}
622
623/*
624 * Flush a revoke descriptor out to the journal. If we are aborting,
625 * this is a noop; otherwise we are generating a buffer which needs to
626 * be waited for during commit, so it has to go onto the appropriate
627 * journal buffer list.
628 */
629
630static void flush_descriptor(journal_t *journal,
631 struct journal_head *descriptor,
632 int offset, int write_op)
633{
634 journal_revoke_header_t *header;
635 struct buffer_head *bh = jh2bh(descriptor);
636
637 if (is_journal_aborted(journal)) {
638 put_bh(bh);
639 return;
640 }
641
642 header = (journal_revoke_header_t *) jh2bh(descriptor)->b_data;
643 header->r_count = cpu_to_be32(offset);
644 set_buffer_jwrite(bh);
645 BUFFER_TRACE(bh, "write");
646 set_buffer_dirty(bh);
647 write_dirty_buffer(bh, write_op);
648}
649#endif
650
651/*
652 * Revoke support for recovery.
653 *
654 * Recovery needs to be able to:
655 *
656 * record all revoke records, including the tid of the latest instance
657 * of each revoke in the journal
658 *
659 * check whether a given block in a given transaction should be replayed
660 * (ie. has not been revoked by a revoke record in that or a subsequent
661 * transaction)
662 *
663 * empty the revoke table after recovery.
664 */
665
666/*
667 * First, setting revoke records. We create a new revoke record for
668 * every block ever revoked in the log as we scan it for recovery, and
669 * we update the existing records if we find multiple revokes for a
670 * single block.
671 */
672
673int journal_set_revoke(journal_t *journal,
674 unsigned int blocknr,
675 tid_t sequence)
676{
677 struct jbd_revoke_record_s *record;
678
679 record = find_revoke_record(journal, blocknr);
680 if (record) {
681 /* If we have multiple occurrences, only record the
682 * latest sequence number in the hashed record */
683 if (tid_gt(sequence, record->sequence))
684 record->sequence = sequence;
685 return 0;
686 }
687 return insert_revoke_hash(journal, blocknr, sequence);
688}
689
690/*
691 * Test revoke records. For a given block referenced in the log, has
692 * that block been revoked? A revoke record with a given transaction
693 * sequence number revokes all blocks in that transaction and earlier
694 * ones, but later transactions still need to be replayed.
695 */
696
697int journal_test_revoke(journal_t *journal,
698 unsigned int blocknr,
699 tid_t sequence)
700{
701 struct jbd_revoke_record_s *record;
702
703 record = find_revoke_record(journal, blocknr);
704 if (!record)
705 return 0;
706 if (tid_gt(sequence, record->sequence))
707 return 0;
708 return 1;
709}
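For example, if the log contains a revoke of block 123 recorded by transaction 12, copies of block 123 logged by transactions 12 and earlier are skipped during replay, while a copy logged by transaction 13 is replayed as usual.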
710
711/*
712 * Finally, once recovery is over, we need to clear the revoke table so
713 * that it can be reused by the running filesystem.
714 */
715
716void journal_clear_revoke(journal_t *journal)
717{
718 int i;
719 struct list_head *hash_list;
720 struct jbd_revoke_record_s *record;
721 struct jbd_revoke_table_s *revoke;
722
723 revoke = journal->j_revoke;
724
725 for (i = 0; i < revoke->hash_size; i++) {
726 hash_list = &revoke->hash_table[i];
727 while (!list_empty(hash_list)) {
728 record = (struct jbd_revoke_record_s*) hash_list->next;
729 list_del(&record->hash);
730 kmem_cache_free(revoke_record_cache, record);
731 }
732 }
733}
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
deleted file mode 100644
index 1695ba8334a2..000000000000
--- a/fs/jbd/transaction.c
+++ /dev/null
@@ -1,2237 +0,0 @@
1/*
2 * linux/fs/jbd/transaction.c
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5 *
6 * Copyright 1998 Red Hat corp --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Generic filesystem transaction handling code; part of the ext2fs
13 * journaling system.
14 *
15 * This file manages transactions (compound commits managed by the
16 * journaling code) and handles (individual atomic operations by the
17 * filesystem).
18 */
19
20#include <linux/time.h>
21#include <linux/fs.h>
22#include <linux/jbd.h>
23#include <linux/errno.h>
24#include <linux/slab.h>
25#include <linux/timer.h>
26#include <linux/mm.h>
27#include <linux/highmem.h>
28#include <linux/hrtimer.h>
29
30static void __journal_temp_unlink_buffer(struct journal_head *jh);
31
32/*
33 * get_transaction: obtain a new transaction_t object.
34 *
35 * Simply allocate and initialise a new transaction. Create it in
36 * RUNNING state and add it to the current journal (which should not
37 * have an existing running transaction: we only make a new transaction
38 * once we have started to commit the old one).
39 *
40 * Preconditions:
41 * The journal MUST be locked. We don't perform atomic mallocs on the
42 * new transaction and we can't block without protecting against other
43 * processes trying to touch the journal while it is in transition.
44 *
45 * Called under j_state_lock
46 */
47
48static transaction_t *
49get_transaction(journal_t *journal, transaction_t *transaction)
50{
51 transaction->t_journal = journal;
52 transaction->t_state = T_RUNNING;
53 transaction->t_start_time = ktime_get();
54 transaction->t_tid = journal->j_transaction_sequence++;
55 transaction->t_expires = jiffies + journal->j_commit_interval;
56 spin_lock_init(&transaction->t_handle_lock);
57
58 /* Set up the commit timer for the new transaction. */
59 journal->j_commit_timer.expires =
60 round_jiffies_up(transaction->t_expires);
61 add_timer(&journal->j_commit_timer);
62
63 J_ASSERT(journal->j_running_transaction == NULL);
64 journal->j_running_transaction = transaction;
65
66 return transaction;
67}
68
69/*
70 * Handle management.
71 *
72 * A handle_t is an object which represents a single atomic update to a
73 * filesystem, and which tracks all of the modifications which form part
74 * of that one update.
75 */
76
77/*
78 * start_this_handle: Given a handle, deal with any locking or stalling
79 * needed to make sure that there is enough journal space for the handle
80 * to begin. Attach the handle to a transaction and set up the
81 * transaction's buffer credits.
82 */
83
84static int start_this_handle(journal_t *journal, handle_t *handle)
85{
86 transaction_t *transaction;
87 int needed;
88 int nblocks = handle->h_buffer_credits;
89 transaction_t *new_transaction = NULL;
90 int ret = 0;
91
92 if (nblocks > journal->j_max_transaction_buffers) {
93 printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
94 current->comm, nblocks,
95 journal->j_max_transaction_buffers);
96 ret = -ENOSPC;
97 goto out;
98 }
99
100alloc_transaction:
101 if (!journal->j_running_transaction) {
102 new_transaction = kzalloc(sizeof(*new_transaction),
103 GFP_NOFS|__GFP_NOFAIL);
104 if (!new_transaction) {
105 ret = -ENOMEM;
106 goto out;
107 }
108 }
109
110 jbd_debug(3, "New handle %p going live.\n", handle);
111
112repeat:
113
114 /*
115 * We need to hold j_state_lock until t_updates has been incremented,
116 * for proper journal barrier handling
117 */
118 spin_lock(&journal->j_state_lock);
119repeat_locked:
120 if (is_journal_aborted(journal) ||
121 (journal->j_errno != 0 && !(journal->j_flags & JFS_ACK_ERR))) {
122 spin_unlock(&journal->j_state_lock);
123 ret = -EROFS;
124 goto out;
125 }
126
127 /* Wait on the journal's transaction barrier if necessary */
128 if (journal->j_barrier_count) {
129 spin_unlock(&journal->j_state_lock);
130 wait_event(journal->j_wait_transaction_locked,
131 journal->j_barrier_count == 0);
132 goto repeat;
133 }
134
135 if (!journal->j_running_transaction) {
136 if (!new_transaction) {
137 spin_unlock(&journal->j_state_lock);
138 goto alloc_transaction;
139 }
140 get_transaction(journal, new_transaction);
141 new_transaction = NULL;
142 }
143
144 transaction = journal->j_running_transaction;
145
146 /*
147 * If the current transaction is locked down for commit, wait for the
148 * lock to be released.
149 */
150 if (transaction->t_state == T_LOCKED) {
151 DEFINE_WAIT(wait);
152
153 prepare_to_wait(&journal->j_wait_transaction_locked,
154 &wait, TASK_UNINTERRUPTIBLE);
155 spin_unlock(&journal->j_state_lock);
156 schedule();
157 finish_wait(&journal->j_wait_transaction_locked, &wait);
158 goto repeat;
159 }
160
161 /*
162 * If there is not enough space left in the log to write all potential
163 * buffers requested by this operation, we need to stall pending a log
164 * checkpoint to free some more log space.
165 */
166 spin_lock(&transaction->t_handle_lock);
167 needed = transaction->t_outstanding_credits + nblocks;
168
169 if (needed > journal->j_max_transaction_buffers) {
170 /*
171 * If the current transaction is already too large, then start
172 * to commit it: we can then go back and attach this handle to
173 * a new transaction.
174 */
175 DEFINE_WAIT(wait);
176
177 jbd_debug(2, "Handle %p starting new commit...\n", handle);
178 spin_unlock(&transaction->t_handle_lock);
179 prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
180 TASK_UNINTERRUPTIBLE);
181 __log_start_commit(journal, transaction->t_tid);
182 spin_unlock(&journal->j_state_lock);
183 schedule();
184 finish_wait(&journal->j_wait_transaction_locked, &wait);
185 goto repeat;
186 }
187
188 /*
189 * The commit code assumes that it can get enough log space
190 * without forcing a checkpoint. This is *critical* for
191 * correctness: a checkpoint of a buffer which is also
192 * associated with a committing transaction creates a deadlock,
193 * so commit simply cannot force through checkpoints.
194 *
195 * We must therefore ensure the necessary space in the journal
196 * *before* starting to dirty potentially checkpointed buffers
197 * in the new transaction.
198 *
199 * The worst part is, any transaction currently committing can
200 * reduce the free space arbitrarily. Be careful to account for
201 * those buffers when checkpointing.
202 */
203
204 /*
205 * @@@ AKPM: This seems rather over-defensive. We're giving commit
206 * a _lot_ of headroom: 1/4 of the journal plus the size of
207 * the committing transaction. Really, we only need to give it
208 * committing_transaction->t_outstanding_credits plus "enough" for
209 * the log control blocks.
210 * Also, this test is inconsistent with the matching one in
211 * journal_extend().
212 */
213 if (__log_space_left(journal) < jbd_space_needed(journal)) {
214 jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle);
215 spin_unlock(&transaction->t_handle_lock);
216 __log_wait_for_space(journal);
217 goto repeat_locked;
218 }
219
220 /* OK, account for the buffers that this operation expects to
221 * use and add the handle to the running transaction. */
222
223 handle->h_transaction = transaction;
224 transaction->t_outstanding_credits += nblocks;
225 transaction->t_updates++;
226 transaction->t_handle_count++;
227 jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n",
228 handle, nblocks, transaction->t_outstanding_credits,
229 __log_space_left(journal));
230 spin_unlock(&transaction->t_handle_lock);
231 spin_unlock(&journal->j_state_lock);
232
233 lock_map_acquire(&handle->h_lockdep_map);
234out:
235 if (unlikely(new_transaction)) /* It's usually NULL */
236 kfree(new_transaction);
237 return ret;
238}
239
240static struct lock_class_key jbd_handle_key;
241
242/* Allocate a new handle. This should probably be in a slab... */
243static handle_t *new_handle(int nblocks)
244{
245 handle_t *handle = jbd_alloc_handle(GFP_NOFS);
246 if (!handle)
247 return NULL;
248 handle->h_buffer_credits = nblocks;
249 handle->h_ref = 1;
250
251 lockdep_init_map(&handle->h_lockdep_map, "jbd_handle", &jbd_handle_key, 0);
252
253 return handle;
254}
255
256/**
257 * handle_t *journal_start() - Obtain a new handle.
258 * @journal: Journal to start transaction on.
259 * @nblocks: number of block buffer we might modify
260 *
261 * We make sure that the transaction can guarantee at least nblocks of
262 * modified buffers in the log. We block until the log can guarantee
263 * that much space.
264 *
265 * This function is visible to journal users (like ext3fs), so is not
266 * called with the journal already locked.
267 *
268 * Return a pointer to a newly allocated handle, or an ERR_PTR() value
269 * on failure.
270 */
271handle_t *journal_start(journal_t *journal, int nblocks)
272{
273 handle_t *handle = journal_current_handle();
274 int err;
275
276 if (!journal)
277 return ERR_PTR(-EROFS);
278
279 if (handle) {
280 J_ASSERT(handle->h_transaction->t_journal == journal);
281 handle->h_ref++;
282 return handle;
283 }
284
285 handle = new_handle(nblocks);
286 if (!handle)
287 return ERR_PTR(-ENOMEM);
288
289 current->journal_info = handle;
290
291 err = start_this_handle(journal, handle);
292 if (err < 0) {
293 jbd_free_handle(handle);
294 current->journal_info = NULL;
295 handle = ERR_PTR(err);
296 }
297 return handle;
298}
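
/*
 * Editorial sketch (not part of the original source): a minimal, hypothetical
 * caller showing the journal_start()/journal_stop() pairing described above.
 * The function name example_simple_update and the credit count of 4 are made
 * up for illustration; IS_ERR()/PTR_ERR() come from <linux/err.h>. A real
 * caller (e.g. ext3) sizes the credits from the worst-case number of metadata
 * buffers the operation can dirty.
 */
static int example_simple_update(journal_t *journal)
{
	handle_t *handle;

	handle = journal_start(journal, 4);	/* may block for log space */
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	/* ... journal_get_write_access() + modify buffers here ... */

	return journal_stop(handle);		/* may kick a commit; 0 or error */
}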
299
300/**
301 * int journal_extend() - extend buffer credits.
302 * @handle: handle to 'extend'
303 * @nblocks: nr blocks to try to extend by.
304 *
305 * Some transactions, such as large extends and truncates, can be done
306 * atomically all at once or in several stages. The operation requests
307 * a credit for a number of buffer modifications in advance, but can
308 * extend its credit if it needs more.
309 *
310 * journal_extend tries to give the running handle more buffer credits.
311 * It does not guarantee the allocation - this is best-effort only.
312 * The calling process MUST be able to deal cleanly with a failure to
313 * extend here.
314 *
315 * Return 0 on success, non-zero on failure.
316 *
317 * return code < 0 implies an error
318 * return code > 0 implies normal transaction-full status.
319 */
320int journal_extend(handle_t *handle, int nblocks)
321{
322 transaction_t *transaction = handle->h_transaction;
323 journal_t *journal = transaction->t_journal;
324 int result;
325 int wanted;
326
327 result = -EIO;
328 if (is_handle_aborted(handle))
329 goto out;
330
331 result = 1;
332
333 spin_lock(&journal->j_state_lock);
334
335 /* Don't extend a locked-down transaction! */
336 if (handle->h_transaction->t_state != T_RUNNING) {
337 jbd_debug(3, "denied handle %p %d blocks: "
338 "transaction not running\n", handle, nblocks);
339 goto error_out;
340 }
341
342 spin_lock(&transaction->t_handle_lock);
343 wanted = transaction->t_outstanding_credits + nblocks;
344
345 if (wanted > journal->j_max_transaction_buffers) {
346 jbd_debug(3, "denied handle %p %d blocks: "
347 "transaction too large\n", handle, nblocks);
348 goto unlock;
349 }
350
351 if (wanted > __log_space_left(journal)) {
352 jbd_debug(3, "denied handle %p %d blocks: "
353 "insufficient log space\n", handle, nblocks);
354 goto unlock;
355 }
356
357 handle->h_buffer_credits += nblocks;
358 transaction->t_outstanding_credits += nblocks;
359 result = 0;
360
361 jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
362unlock:
363 spin_unlock(&transaction->t_handle_lock);
364error_out:
365 spin_unlock(&journal->j_state_lock);
366out:
367 return result;
368}
369
370
371/**
372 * int journal_restart() - restart a handle.
373 * @handle: handle to restart
374 * @nblocks: nr credits requested
375 *
376 * Restart a handle for a multi-transaction filesystem
377 * operation.
378 *
379 * If the journal_extend() call above fails to grant new buffer credits
380 * to a running handle, a call to journal_restart will commit the
381 * handle's transaction so far and reattach the handle to a new
382 * transaction capable of guaranteeing the requested number of
383 * credits.
384 */
385
386int journal_restart(handle_t *handle, int nblocks)
387{
388 transaction_t *transaction = handle->h_transaction;
389 journal_t *journal = transaction->t_journal;
390 int ret;
391
392 /* If we've had an abort of any type, don't even think about
393 * actually doing the restart! */
394 if (is_handle_aborted(handle))
395 return 0;
396
397 /*
398 * First unlink the handle from its current transaction, and start the
399 * commit on that.
400 */
401 J_ASSERT(transaction->t_updates > 0);
402 J_ASSERT(journal_current_handle() == handle);
403
404 spin_lock(&journal->j_state_lock);
405 spin_lock(&transaction->t_handle_lock);
406 transaction->t_outstanding_credits -= handle->h_buffer_credits;
407 transaction->t_updates--;
408
409 if (!transaction->t_updates)
410 wake_up(&journal->j_wait_updates);
411 spin_unlock(&transaction->t_handle_lock);
412
413 jbd_debug(2, "restarting handle %p\n", handle);
414 __log_start_commit(journal, transaction->t_tid);
415 spin_unlock(&journal->j_state_lock);
416
417 lock_map_release(&handle->h_lockdep_map);
418 handle->h_buffer_credits = nblocks;
419 ret = start_this_handle(journal, handle);
420 return ret;
421}
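
/*
 * Editorial sketch (not part of the original source): how a caller might
 * combine journal_extend() and journal_restart() as the comments above
 * suggest. example_get_more_credits() is a hypothetical helper; the caller
 * must be prepared for journal_restart() to commit its work so far, so any
 * state spanning the restart has to be consistent on its own.
 */
static int example_get_more_credits(handle_t *handle, int nblocks)
{
	int err;

	err = journal_extend(handle, nblocks);
	if (err < 0)
		return err;		/* hard error (e.g. aborted handle) */
	if (err > 0)			/* transaction full: commit and reattach */
		err = journal_restart(handle, nblocks);
	return err;
}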
422
423
424/**
425 * void journal_lock_updates () - establish a transaction barrier.
426 * @journal: Journal to establish a barrier on.
427 *
428 * This locks out any further updates from being started, and blocks until all
429 * existing updates have completed, returning only once the journal is in a
430 * quiescent state with no updates running.
431 *
432 * We do not use simple mutex for synchronization as there are syscalls which
433 * want to return with filesystem locked and that trips up lockdep. Also
434 * hibernate needs to lock filesystem but locked mutex then blocks hibernation.
435 * Since locking filesystem is rare operation, we use simple counter and
436 * waitqueue for locking.
437 */
438void journal_lock_updates(journal_t *journal)
439{
440 DEFINE_WAIT(wait);
441
442wait:
443 /* Wait for previous locked operation to finish */
444 wait_event(journal->j_wait_transaction_locked,
445 journal->j_barrier_count == 0);
446
447 spin_lock(&journal->j_state_lock);
448 /*
449 * Check reliably under the lock whether we are the ones winning the race
450 * and locking the journal
451 */
452 if (journal->j_barrier_count > 0) {
453 spin_unlock(&journal->j_state_lock);
454 goto wait;
455 }
456 ++journal->j_barrier_count;
457
458 /* Wait until there are no running updates */
459 while (1) {
460 transaction_t *transaction = journal->j_running_transaction;
461
462 if (!transaction)
463 break;
464
465 spin_lock(&transaction->t_handle_lock);
466 if (!transaction->t_updates) {
467 spin_unlock(&transaction->t_handle_lock);
468 break;
469 }
470 prepare_to_wait(&journal->j_wait_updates, &wait,
471 TASK_UNINTERRUPTIBLE);
472 spin_unlock(&transaction->t_handle_lock);
473 spin_unlock(&journal->j_state_lock);
474 schedule();
475 finish_wait(&journal->j_wait_updates, &wait);
476 spin_lock(&journal->j_state_lock);
477 }
478 spin_unlock(&journal->j_state_lock);
479}
480
481/**
482 * void journal_unlock_updates (journal_t* journal) - release barrier
483 * @journal: Journal to release the barrier on.
484 *
485 * Release a transaction barrier obtained with journal_lock_updates().
486 */
487void journal_unlock_updates (journal_t *journal)
488{
489 J_ASSERT(journal->j_barrier_count != 0);
490
491 spin_lock(&journal->j_state_lock);
492 --journal->j_barrier_count;
493 spin_unlock(&journal->j_state_lock);
494 wake_up(&journal->j_wait_transaction_locked);
495}
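
/*
 * Editorial sketch (not part of the original source): the barrier pair as a
 * freeze-like path might use it. example_do_quiesced_work() is a placeholder
 * for whatever must run while no handles are active on the journal.
 */
static void example_do_quiesced_work(void)
{
	/* placeholder: work that requires a quiescent journal */
}

static void example_with_quiesced_journal(journal_t *journal)
{
	journal_lock_updates(journal);		/* waits until t_updates drops to 0 */
	example_do_quiesced_work();		/* no new handles can start here */
	journal_unlock_updates(journal);	/* wakes blocked journal_start() callers */
}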
496
497static void warn_dirty_buffer(struct buffer_head *bh)
498{
499 char b[BDEVNAME_SIZE];
500
501 printk(KERN_WARNING
502 "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). "
503 "There's a risk of filesystem corruption in case of system "
504 "crash.\n",
505 bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
506}
507
508/*
509 * If the buffer is already part of the current transaction, then there
510 * is nothing we need to do. If it is already part of a prior
511 * transaction which we are still committing to disk, then we need to
512 * make sure that we do not overwrite the old copy: we do copy-out to
513 * preserve the copy going to disk. We also account the buffer against
514 * the handle's metadata buffer credits (unless the buffer is already
515 * part of the transaction, that is).
516 *
517 */
518static int
519do_get_write_access(handle_t *handle, struct journal_head *jh,
520 int force_copy)
521{
522 struct buffer_head *bh;
523 transaction_t *transaction;
524 journal_t *journal;
525 int error;
526 char *frozen_buffer = NULL;
527 int need_copy = 0;
528
529 if (is_handle_aborted(handle))
530 return -EROFS;
531
532 transaction = handle->h_transaction;
533 journal = transaction->t_journal;
534
535 jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
536
537 JBUFFER_TRACE(jh, "entry");
538repeat:
539 bh = jh2bh(jh);
540
541 /* @@@ Need to check for errors here at some point. */
542
543 lock_buffer(bh);
544 jbd_lock_bh_state(bh);
545
546 /* We now hold the buffer lock so it is safe to query the buffer
547 * state. Is the buffer dirty?
548 *
549 * If so, there are two possibilities. The buffer may be
550 * non-journaled, and undergoing a quite legitimate writeback.
551 * Otherwise, it is journaled, and we don't expect dirty buffers
552 * in that state (the buffers should be marked JBD_Dirty
553 * instead.) So either the IO is being done under our own
554 * control and this is a bug, or it's a third party IO such as
555 * dump(8) (which may leave the buffer scheduled for read ---
556 * ie. locked but not dirty) or tune2fs (which may actually have
557 * the buffer dirtied, ugh.) */
558
559 if (buffer_dirty(bh)) {
560 /*
561 * First question: is this buffer already part of the current
562 * transaction or the existing committing transaction?
563 */
564 if (jh->b_transaction) {
565 J_ASSERT_JH(jh,
566 jh->b_transaction == transaction ||
567 jh->b_transaction ==
568 journal->j_committing_transaction);
569 if (jh->b_next_transaction)
570 J_ASSERT_JH(jh, jh->b_next_transaction ==
571 transaction);
572 warn_dirty_buffer(bh);
573 }
574 /*
575 * In any case we need to clean the dirty flag and we must
576 * do it under the buffer lock to be sure we don't race
577 * with running write-out.
578 */
579 JBUFFER_TRACE(jh, "Journalling dirty buffer");
580 clear_buffer_dirty(bh);
581 set_buffer_jbddirty(bh);
582 }
583
584 unlock_buffer(bh);
585
586 error = -EROFS;
587 if (is_handle_aborted(handle)) {
588 jbd_unlock_bh_state(bh);
589 goto out;
590 }
591 error = 0;
592
593 /*
594 * The buffer is already part of this transaction if b_transaction or
595 * b_next_transaction points to it
596 */
597 if (jh->b_transaction == transaction ||
598 jh->b_next_transaction == transaction)
599 goto done;
600
601 /*
602 * this is the first time this transaction is touching this buffer,
603 * reset the modified flag
604 */
605 jh->b_modified = 0;
606
607 /*
608 * If there is already a copy-out version of this buffer, then we don't
609 * need to make another one
610 */
611 if (jh->b_frozen_data) {
612 JBUFFER_TRACE(jh, "has frozen data");
613 J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
614 jh->b_next_transaction = transaction;
615 goto done;
616 }
617
618 /* Is there data here we need to preserve? */
619
620 if (jh->b_transaction && jh->b_transaction != transaction) {
621 JBUFFER_TRACE(jh, "owned by older transaction");
622 J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
623 J_ASSERT_JH(jh, jh->b_transaction ==
624 journal->j_committing_transaction);
625
626 /* There is one case we have to be very careful about.
627 * If the committing transaction is currently writing
628 * this buffer out to disk and has NOT made a copy-out,
629 * then we cannot modify the buffer contents at all
630 * right now. The essence of copy-out is that it is the
631 * extra copy, not the primary copy, which gets
632 * journaled. If the primary copy is already going to
633 * disk then we cannot do copy-out here. */
634
635 if (jh->b_jlist == BJ_Shadow) {
636 DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow);
637 wait_queue_head_t *wqh;
638
639 wqh = bit_waitqueue(&bh->b_state, BH_Unshadow);
640
641 JBUFFER_TRACE(jh, "on shadow: sleep");
642 jbd_unlock_bh_state(bh);
643 /* commit wakes up all shadow buffers after IO */
644 for ( ; ; ) {
645 prepare_to_wait(wqh, &wait.wait,
646 TASK_UNINTERRUPTIBLE);
647 if (jh->b_jlist != BJ_Shadow)
648 break;
649 schedule();
650 }
651 finish_wait(wqh, &wait.wait);
652 goto repeat;
653 }
654
655 /* Only do the copy if the currently-owning transaction
656 * still needs it. If it is on the Forget list, the
657 * committing transaction is past that stage. The
658 * buffer had better remain locked during the kmalloc,
659 * but that should be true --- we hold the journal lock
660 * still and the buffer is already on the BUF_JOURNAL
661 * list so won't be flushed.
662 *
663 * Subtle point, though: if this is a get_undo_access,
664 * then we will be relying on the frozen_data to contain
665 * the new value of the committed_data record after the
666 * transaction, so we HAVE to force the frozen_data copy
667 * in that case. */
668
669 if (jh->b_jlist != BJ_Forget || force_copy) {
670 JBUFFER_TRACE(jh, "generate frozen data");
671 if (!frozen_buffer) {
672 JBUFFER_TRACE(jh, "allocate memory for buffer");
673 jbd_unlock_bh_state(bh);
674 frozen_buffer =
675 jbd_alloc(jh2bh(jh)->b_size,
676 GFP_NOFS);
677 if (!frozen_buffer) {
678 printk(KERN_ERR
679 "%s: OOM for frozen_buffer\n",
680 __func__);
681 JBUFFER_TRACE(jh, "oom!");
682 error = -ENOMEM;
683 jbd_lock_bh_state(bh);
684 goto done;
685 }
686 goto repeat;
687 }
688 jh->b_frozen_data = frozen_buffer;
689 frozen_buffer = NULL;
690 need_copy = 1;
691 }
692 jh->b_next_transaction = transaction;
693 }
694
695
696 /*
697 * Finally, if the buffer is not journaled right now, we need to make
698 * sure it doesn't get written to disk before the caller actually
699 * commits the new data
700 */
701 if (!jh->b_transaction) {
702 JBUFFER_TRACE(jh, "no transaction");
703 J_ASSERT_JH(jh, !jh->b_next_transaction);
704 JBUFFER_TRACE(jh, "file as BJ_Reserved");
705 spin_lock(&journal->j_list_lock);
706 __journal_file_buffer(jh, transaction, BJ_Reserved);
707 spin_unlock(&journal->j_list_lock);
708 }
709
710done:
711 if (need_copy) {
712 struct page *page;
713 int offset;
714 char *source;
715
716 J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)),
717 "Possible IO failure.\n");
718 page = jh2bh(jh)->b_page;
719 offset = offset_in_page(jh2bh(jh)->b_data);
720 source = kmap_atomic(page);
721 memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
722 kunmap_atomic(source);
723 }
724 jbd_unlock_bh_state(bh);
725
726 /*
727 * If we are about to journal a buffer, then any revoke pending on it is
728 * no longer valid
729 */
730 journal_cancel_revoke(handle, jh);
731
732out:
733 if (unlikely(frozen_buffer)) /* It's usually NULL */
734 jbd_free(frozen_buffer, bh->b_size);
735
736 JBUFFER_TRACE(jh, "exit");
737 return error;
738}
739
740/**
741 * int journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update.
742 * @handle: transaction to add buffer modifications to
743 * @bh: bh to be used for metadata writes
744 *
745 * Returns an error code or 0 on success.
746 *
747 * In full data journalling mode the buffer may be of type BJ_AsyncData,
748 * because we're write()ing a buffer which is also part of a shared mapping.
749 */
750
751int journal_get_write_access(handle_t *handle, struct buffer_head *bh)
752{
753 struct journal_head *jh = journal_add_journal_head(bh);
754 int rc;
755
756 /* We do not want to get caught playing with fields which the
757 * log thread also manipulates. Make sure that the buffer
758 * completes any outstanding IO before proceeding. */
759 rc = do_get_write_access(handle, jh, 0);
760 journal_put_journal_head(jh);
761 return rc;
762}
763
764
765/*
766 * When the user wants to journal a newly created buffer_head
767 * (ie. getblk() returned a new buffer and we are going to populate it
768 * manually rather than reading off disk), then we need to keep the
769 * buffer_head locked until it has been completely filled with new
770 * data. In this case, we should be able to make the assertion that
771 * the bh is not already part of an existing transaction.
772 *
773 * The buffer should already be locked by the caller by this point.
774 * There is no lock ranking violation: it was a newly created,
775 * unlocked buffer beforehand. */
776
777/**
778 * int journal_get_create_access () - notify intent to use newly created bh
779 * @handle: transaction to add the new buffer to
780 * @bh: new buffer.
781 *
782 * Call this if you create a new bh.
783 */
784int journal_get_create_access(handle_t *handle, struct buffer_head *bh)
785{
786 transaction_t *transaction = handle->h_transaction;
787 journal_t *journal = transaction->t_journal;
788 struct journal_head *jh = journal_add_journal_head(bh);
789 int err;
790
791 jbd_debug(5, "journal_head %p\n", jh);
792 err = -EROFS;
793 if (is_handle_aborted(handle))
794 goto out;
795 err = 0;
796
797 JBUFFER_TRACE(jh, "entry");
798 /*
799 * The buffer may already belong to this transaction due to pre-zeroing
800 * in the filesystem's new_block code. It may also be on the previous,
801 * committing transaction's lists, but it HAS to be in Forget state in
802 * that case: the transaction must have deleted the buffer for it to be
803 * reused here.
804 */
805 jbd_lock_bh_state(bh);
806 spin_lock(&journal->j_list_lock);
807 J_ASSERT_JH(jh, (jh->b_transaction == transaction ||
808 jh->b_transaction == NULL ||
809 (jh->b_transaction == journal->j_committing_transaction &&
810 jh->b_jlist == BJ_Forget)));
811
812 J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
813 J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));
814
815 if (jh->b_transaction == NULL) {
816 /*
817 * Previous journal_forget() could have left the buffer
818 * with jbddirty bit set because it was being committed. When
819 * the commit finished, we've filed the buffer for
820 * checkpointing and marked it dirty. Now we are reallocating
821 * the buffer so the transaction freeing it must have
822 * committed and so it's safe to clear the dirty bit.
823 */
824 clear_buffer_dirty(jh2bh(jh));
825
826 /* first access by this transaction */
827 jh->b_modified = 0;
828
829 JBUFFER_TRACE(jh, "file as BJ_Reserved");
830 __journal_file_buffer(jh, transaction, BJ_Reserved);
831 } else if (jh->b_transaction == journal->j_committing_transaction) {
832 /* first access by this transaction */
833 jh->b_modified = 0;
834
835 JBUFFER_TRACE(jh, "set next transaction");
836 jh->b_next_transaction = transaction;
837 }
838 spin_unlock(&journal->j_list_lock);
839 jbd_unlock_bh_state(bh);
840
841 /*
842 * akpm: I added this. ext3_alloc_branch can pick up new indirect
843 * blocks which contain freed but then revoked metadata. We need
844 * to cancel the revoke in case we end up freeing it yet again
845 * and the reallocating as data - this would cause a second revoke,
846 * which hits an assertion error.
847 */
848 JBUFFER_TRACE(jh, "cancelling revoke");
849 journal_cancel_revoke(handle, jh);
850out:
851 journal_put_journal_head(jh);
852 return err;
853}
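
/*
 * Editorial sketch (not part of the original source): typical use of
 * journal_get_create_access() on a freshly allocated metadata block, loosely
 * modelled on an ext3-style caller. The super_block and block number are
 * assumed to come from the caller; journal_dirty_metadata() is defined
 * further down in this file.
 */
static int example_init_new_metadata_block(handle_t *handle,
					   struct super_block *sb,
					   sector_t blocknr)
{
	struct buffer_head *bh;
	int err;

	bh = sb_getblk(sb, blocknr);
	if (!bh)
		return -ENOMEM;

	lock_buffer(bh);			/* caller must hold the buffer lock */
	err = journal_get_create_access(handle, bh);
	if (!err) {
		memset(bh->b_data, 0, bh->b_size);	/* fill in new contents */
		set_buffer_uptodate(bh);
	}
	unlock_buffer(bh);
	if (!err)
		err = journal_dirty_metadata(handle, bh);
	brelse(bh);
	return err;
}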
854
855/**
856 * int journal_get_undo_access() - Notify intent to modify metadata with non-rewindable consequences
857 * @handle: transaction
858 * @bh: buffer to undo
859 *
860 * Sometimes there is a need to distinguish between metadata which has
861 * been committed to disk and that which has not. The ext3fs code uses
862 * this for freeing and allocating space, we have to make sure that we
863 * do not reuse freed space until the deallocation has been committed,
864 * since if we overwrote that space we would make the delete
865 * un-rewindable in case of a crash.
866 *
867 * To deal with that, journal_get_undo_access requests write access to a
868 * buffer for parts of non-rewindable operations such as delete
869 * operations on the bitmaps. The journaling code must keep a copy of
870 * the buffer's contents prior to the undo_access call until such time
871 * as we know that the buffer has definitely been committed to disk.
872 *
873 * We never need to know which transaction the committed data is part
874 * of, buffers touched here are guaranteed to be dirtied later and so
875 * will be committed to a new transaction in due course, at which point
876 * we can discard the old committed data pointer.
877 *
878 * Returns error number or 0 on success.
879 */
880int journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
881{
882 int err;
883 struct journal_head *jh = journal_add_journal_head(bh);
884 char *committed_data = NULL;
885
886 JBUFFER_TRACE(jh, "entry");
887
888 /*
889 * Do this first --- it can drop the journal lock, so we want to
890 * make sure that obtaining the committed_data is done
891 * atomically wrt. completion of any outstanding commits.
892 */
893 err = do_get_write_access(handle, jh, 1);
894 if (err)
895 goto out;
896
897repeat:
898 if (!jh->b_committed_data) {
899 committed_data = jbd_alloc(jh2bh(jh)->b_size, GFP_NOFS);
900 if (!committed_data) {
901 printk(KERN_ERR "%s: No memory for committed data\n",
902 __func__);
903 err = -ENOMEM;
904 goto out;
905 }
906 }
907
908 jbd_lock_bh_state(bh);
909 if (!jh->b_committed_data) {
910 /* Copy out the current buffer contents into the
911 * preserved, committed copy. */
912 JBUFFER_TRACE(jh, "generate b_committed data");
913 if (!committed_data) {
914 jbd_unlock_bh_state(bh);
915 goto repeat;
916 }
917
918 jh->b_committed_data = committed_data;
919 committed_data = NULL;
920 memcpy(jh->b_committed_data, bh->b_data, bh->b_size);
921 }
922 jbd_unlock_bh_state(bh);
923out:
924 journal_put_journal_head(jh);
925 if (unlikely(committed_data))
926 jbd_free(committed_data, bh->b_size);
927 return err;
928}
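
/*
 * Editorial sketch (not part of the original source): the undo-access pattern
 * for a block bitmap, as described above. bitmap_bh and bit are assumed to be
 * supplied by the caller; the endianness/atomicity handling of the real ext3
 * bitmap helpers is omitted, and journal_dirty_metadata() is defined below.
 */
static int example_free_bit_in_bitmap(handle_t *handle,
				      struct buffer_head *bitmap_bh, int bit)
{
	int err;

	/* preserve the committed copy of the bitmap before touching it */
	err = journal_get_undo_access(handle, bitmap_bh);
	if (err)
		return err;

	clear_bit(bit, (unsigned long *)bitmap_bh->b_data);
	return journal_dirty_metadata(handle, bitmap_bh);
}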
929
930/**
931 * int journal_dirty_data() - mark a buffer as containing dirty data to be flushed
932 * @handle: transaction
933 * @bh: bufferhead to mark
934 *
935 * Description:
936 * Mark a buffer as containing dirty data which needs to be flushed before
937 * we can commit the current transaction.
938 *
939 * The buffer is placed on the transaction's data list and is marked as
940 * belonging to the transaction.
941 *
942 * Returns error number or 0 on success.
943 *
944 * journal_dirty_data() can be called via page_launder->ext3_writepage
945 * by kswapd.
946 */
947int journal_dirty_data(handle_t *handle, struct buffer_head *bh)
948{
949 journal_t *journal = handle->h_transaction->t_journal;
950 int need_brelse = 0;
951 struct journal_head *jh;
952 int ret = 0;
953
954 if (is_handle_aborted(handle))
955 return ret;
956
957 jh = journal_add_journal_head(bh);
958 JBUFFER_TRACE(jh, "entry");
959
960 /*
961 * The buffer could *already* be dirty. Writeout can start
962 * at any time.
963 */
964 jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid);
965
966 /*
967 * What if the buffer is already part of a running transaction?
968 *
969 * There are two cases:
970 * 1) It is part of the current running transaction. Refile it,
971 * just in case we have allocated it as metadata, deallocated
972 * it, then reallocated it as data.
973 * 2) It is part of the previous, still-committing transaction.
974 * If all we want to do is to guarantee that the buffer will be
975 * written to disk before this new transaction commits, then
976 * being sure that the *previous* transaction has this same
977 * property is sufficient for us! Just leave it on its old
978 * transaction.
979 *
980 * In case (2), the buffer must not already exist as metadata
981 * --- that would violate write ordering (a transaction is free
982 * to write its data at any point, even before the previous
983 * committing transaction has committed). The caller must
984 * never, ever allow this to happen: there's nothing we can do
985 * about it in this layer.
986 */
987 jbd_lock_bh_state(bh);
988 spin_lock(&journal->j_list_lock);
989
990 /* Now that we have bh_state locked, are we really still mapped? */
991 if (!buffer_mapped(bh)) {
992 JBUFFER_TRACE(jh, "unmapped buffer, bailing out");
993 goto no_journal;
994 }
995
996 if (jh->b_transaction) {
997 JBUFFER_TRACE(jh, "has transaction");
998 if (jh->b_transaction != handle->h_transaction) {
999 JBUFFER_TRACE(jh, "belongs to older transaction");
1000 J_ASSERT_JH(jh, jh->b_transaction ==
1001 journal->j_committing_transaction);
1002
1003 /* @@@ IS THIS TRUE ? */
1004 /*
1005 * Not any more. Scenario: someone does a write()
1006 * in data=journal mode. The buffer's transaction has
1007 * moved into commit. Then someone does another
1008 * write() to the file. We do the frozen data copyout
1009 * and set b_next_transaction to point to j_running_t.
1010 * And while we're in that state, someone does a
1011 * writepage() in an attempt to pageout the same area
1012 * of the file via a shared mapping. At present that
1013 * calls journal_dirty_data(), and we get right here.
1014 * It may be too late to journal the data. Simply
1015 * falling through to the next test will suffice: the
1016 * data will be dirty and will be checkpointed. The
1017 * ordering comments in the next comment block still
1018 * apply.
1019 */
1020 //J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
1021
1022 /*
1023 * If we're journalling data, and this buffer was
1024 * subject to a write(), it could be metadata, forget
1025 * or shadow against the committing transaction. Now,
1026 * someone has dirtied the same darn page via a mapping
1027 * and it is being writepage()'d.
1028 * We *could* just steal the page from commit, with some
1029 * fancy locking there. Instead, we just skip it -
1030 * don't tie the page's buffers to the new transaction
1031 * at all.
1032 * Implication: if we crash before the writepage() data
1033 * is written into the filesystem, recovery will replay
1034 * the write() data.
1035 */
1036 if (jh->b_jlist != BJ_None &&
1037 jh->b_jlist != BJ_SyncData &&
1038 jh->b_jlist != BJ_Locked) {
1039 JBUFFER_TRACE(jh, "Not stealing");
1040 goto no_journal;
1041 }
1042
1043 /*
1044 * This buffer may be undergoing writeout in commit. We
1045 * can't return from here and let the caller dirty it
1046 * again because that can cause the write-out loop in
1047 * commit to never terminate.
1048 */
1049 if (buffer_dirty(bh)) {
1050 get_bh(bh);
1051 spin_unlock(&journal->j_list_lock);
1052 jbd_unlock_bh_state(bh);
1053 need_brelse = 1;
1054 sync_dirty_buffer(bh);
1055 jbd_lock_bh_state(bh);
1056 spin_lock(&journal->j_list_lock);
1057 /* Since we dropped the lock... */
1058 if (!buffer_mapped(bh)) {
1059 JBUFFER_TRACE(jh, "buffer got unmapped");
1060 goto no_journal;
1061 }
1062 /* The buffer may become locked again at any
1063 time if it is redirtied */
1064 }
1065
1066 /*
1067 * We cannot remove the buffer with io error from the
1068 * committing transaction, because otherwise it would
1069 * miss the error and the commit would not abort.
1070 */
1071 if (unlikely(!buffer_uptodate(bh))) {
1072 ret = -EIO;
1073 goto no_journal;
1074 }
1075 /* We might have slept so buffer could be refiled now */
1076 if (jh->b_transaction != NULL &&
1077 jh->b_transaction != handle->h_transaction) {
1078 JBUFFER_TRACE(jh, "unfile from commit");
1079 __journal_temp_unlink_buffer(jh);
1080 /* It still points to the committing
1081 * transaction; move it to this one so
1082 * that the refile assert checks are
1083 * happy. */
1084 jh->b_transaction = handle->h_transaction;
1085 }
1086 /* The buffer will be refiled below */
1087
1088 }
1089 /*
1090 * Special case --- the buffer might actually have been
1091 * allocated and then immediately deallocated in the previous,
1092 * committing transaction, so might still be left on that
1093 * transaction's metadata lists.
1094 */
1095 if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) {
1096 JBUFFER_TRACE(jh, "not on correct data list: unfile");
1097 J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
1098 JBUFFER_TRACE(jh, "file as data");
1099 __journal_file_buffer(jh, handle->h_transaction,
1100 BJ_SyncData);
1101 }
1102 } else {
1103 JBUFFER_TRACE(jh, "not on a transaction");
1104 __journal_file_buffer(jh, handle->h_transaction, BJ_SyncData);
1105 }
1106no_journal:
1107 spin_unlock(&journal->j_list_lock);
1108 jbd_unlock_bh_state(bh);
1109 if (need_brelse) {
1110 BUFFER_TRACE(bh, "brelse");
1111 __brelse(bh);
1112 }
1113 JBUFFER_TRACE(jh, "exit");
1114 journal_put_journal_head(jh);
1115 return ret;
1116}
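
/*
 * Editorial sketch (not part of the original source): ordered-data callers
 * (ext3's data=ordered writepage path) walk the page's buffers and file each
 * one with journal_dirty_data(). example_journal_data_buffers() is a
 * hypothetical helper; the page is assumed to be locked with buffers present.
 */
static int example_journal_data_buffers(handle_t *handle, struct page *page)
{
	struct buffer_head *bh, *head;
	int err = 0;

	head = page_buffers(page);
	bh = head;
	do {
		err = journal_dirty_data(handle, bh);
		if (err)
			break;
	} while ((bh = bh->b_this_page) != head);
	return err;
}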
1117
1118/**
1119 * int journal_dirty_metadata() - mark a buffer as containing dirty metadata
1120 * @handle: transaction to add buffer to.
1121 * @bh: buffer to mark
1122 *
1123 * Mark dirty metadata which needs to be journaled as part of the current
1124 * transaction.
1125 *
1126 * The buffer is placed on the transaction's metadata list and is marked
1127 * as belonging to the transaction.
1128 *
1129 * Returns error number or 0 on success.
1130 *
1131 * Special care needs to be taken if the buffer already belongs to the
1132 * current committing transaction (in which case we should have frozen
1133 * data present for that commit). In that case, we don't relink the
1134 * buffer: that only gets done when the old transaction finally
1135 * completes its commit.
1136 */
1137int journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
1138{
1139 transaction_t *transaction = handle->h_transaction;
1140 journal_t *journal = transaction->t_journal;
1141 struct journal_head *jh = bh2jh(bh);
1142
1143 jbd_debug(5, "journal_head %p\n", jh);
1144 JBUFFER_TRACE(jh, "entry");
1145 if (is_handle_aborted(handle))
1146 goto out;
1147
1148 jbd_lock_bh_state(bh);
1149
1150 if (jh->b_modified == 0) {
1151 /*
1152 * This buffer has been modified and is becoming part
1153 * of the transaction. This needs to be done
1154 * once per transaction. -bzzz
1155 */
1156 jh->b_modified = 1;
1157 J_ASSERT_JH(jh, handle->h_buffer_credits > 0);
1158 handle->h_buffer_credits--;
1159 }
1160
1161 /*
1162 * fastpath, to avoid expensive locking. If this buffer is already
1163 * on the running transaction's metadata list there is nothing to do.
1164 * Nobody can take it off again because there is a handle open.
1165 * I _think_ we're OK here with SMP barriers - a mistaken decision will
1166 * result in this test being false, so we go in and take the locks.
1167 */
1168 if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) {
1169 JBUFFER_TRACE(jh, "fastpath");
1170 J_ASSERT_JH(jh, jh->b_transaction ==
1171 journal->j_running_transaction);
1172 goto out_unlock_bh;
1173 }
1174
1175 set_buffer_jbddirty(bh);
1176
1177 /*
1178 * Metadata already on the current transaction list doesn't
1179 * need to be filed. Metadata on another transaction's list must
1180 * be committing, and will be refiled once the commit completes:
1181 * leave it alone for now.
1182 */
1183 if (jh->b_transaction != transaction) {
1184 JBUFFER_TRACE(jh, "already on other transaction");
1185 J_ASSERT_JH(jh, jh->b_transaction ==
1186 journal->j_committing_transaction);
1187 J_ASSERT_JH(jh, jh->b_next_transaction == transaction);
1188 /* And this case is illegal: we can't reuse another
1189 * transaction's data buffer, ever. */
1190 goto out_unlock_bh;
1191 }
1192
1193 /* That test should have eliminated the following case: */
1194 J_ASSERT_JH(jh, jh->b_frozen_data == NULL);
1195
1196 JBUFFER_TRACE(jh, "file as BJ_Metadata");
1197 spin_lock(&journal->j_list_lock);
1198 __journal_file_buffer(jh, handle->h_transaction, BJ_Metadata);
1199 spin_unlock(&journal->j_list_lock);
1200out_unlock_bh:
1201 jbd_unlock_bh_state(bh);
1202out:
1203 JBUFFER_TRACE(jh, "exit");
1204 return 0;
1205}
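
/*
 * Editorial sketch (not part of the original source): the canonical
 * read / get_write_access / modify / dirty_metadata sequence that the comment
 * above assumes. sb_bread() and the example_modify() callback stand in for
 * whatever the real caller does; only the ordering of the journal calls
 * matters here.
 */
static int example_modify_metadata_block(handle_t *handle,
					 struct super_block *sb,
					 sector_t blocknr,
					 void (*example_modify)(char *data))
{
	struct buffer_head *bh;
	int err;

	bh = sb_bread(sb, blocknr);		/* read the current on-disk copy */
	if (!bh)
		return -EIO;

	err = journal_get_write_access(handle, bh);	/* before any modification */
	if (!err) {
		example_modify(bh->b_data);		  /* change the contents */
		err = journal_dirty_metadata(handle, bh); /* log it in this handle */
	}
	brelse(bh);
	return err;
}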
1206
1207/*
1208 * journal_release_buffer: undo a get_write_access without any buffer
1209 * updates, if the update decided in the end that it didn't need access.
1210 *
1211 */
1212void
1213journal_release_buffer(handle_t *handle, struct buffer_head *bh)
1214{
1215 BUFFER_TRACE(bh, "entry");
1216}
1217
1218/**
1219 * void journal_forget() - bforget() for potentially-journaled buffers.
1220 * @handle: transaction handle
1221 * @bh: bh to 'forget'
1222 *
1223 * We can only do the bforget if there are no commits pending against the
1224 * buffer. If the buffer is dirty in the current running transaction we
1225 * can safely unlink it.
1226 *
1227 * bh may not be a journalled buffer at all - it may be a non-JBD
1228 * buffer which came off the hashtable. Check for this.
1229 *
1230 * Decrements bh->b_count by one.
1231 *
1232 * Allow this call even if the handle has aborted --- it may be part of
1233 * the caller's cleanup after an abort.
1234 */
1235int journal_forget (handle_t *handle, struct buffer_head *bh)
1236{
1237 transaction_t *transaction = handle->h_transaction;
1238 journal_t *journal = transaction->t_journal;
1239 struct journal_head *jh;
1240 int drop_reserve = 0;
1241 int err = 0;
1242 int was_modified = 0;
1243
1244 BUFFER_TRACE(bh, "entry");
1245
1246 jbd_lock_bh_state(bh);
1247 spin_lock(&journal->j_list_lock);
1248
1249 if (!buffer_jbd(bh))
1250 goto not_jbd;
1251 jh = bh2jh(bh);
1252
1253 /* Critical error: attempting to delete a bitmap buffer, maybe?
1254 * Don't do any jbd operations, and return an error. */
1255 if (!J_EXPECT_JH(jh, !jh->b_committed_data,
1256 "inconsistent data on disk")) {
1257 err = -EIO;
1258 goto not_jbd;
1259 }
1260
1261 /* keep track of whether or not this transaction modified us */
1262 was_modified = jh->b_modified;
1263
1264 /*
1265 * The buffer's going from the transaction, we must drop
1266 * all references -bzzz
1267 */
1268 jh->b_modified = 0;
1269
1270 if (jh->b_transaction == handle->h_transaction) {
1271 J_ASSERT_JH(jh, !jh->b_frozen_data);
1272
1273 /* If we are forgetting a buffer which is already part
1274 * of this transaction, then we can just drop it from
1275 * the transaction immediately. */
1276 clear_buffer_dirty(bh);
1277 clear_buffer_jbddirty(bh);
1278
1279 JBUFFER_TRACE(jh, "belongs to current transaction: unfile");
1280
1281 /*
1282 * we only want to drop a reference if this transaction
1283 * modified the buffer
1284 */
1285 if (was_modified)
1286 drop_reserve = 1;
1287
1288 /*
1289 * We are no longer going to journal this buffer.
1290 * However, the commit of this transaction is still
1291 * important to the buffer: the delete that we are now
1292 * processing might obsolete an old log entry, so by
1293 * committing, we can satisfy the buffer's checkpoint.
1294 *
1295 * So, if we have a checkpoint on the buffer, we should
1296 * now refile the buffer on our BJ_Forget list so that
1297 * we know to remove the checkpoint after we commit.
1298 */
1299
1300 if (jh->b_cp_transaction) {
1301 __journal_temp_unlink_buffer(jh);
1302 __journal_file_buffer(jh, transaction, BJ_Forget);
1303 } else {
1304 __journal_unfile_buffer(jh);
1305 if (!buffer_jbd(bh)) {
1306 spin_unlock(&journal->j_list_lock);
1307 jbd_unlock_bh_state(bh);
1308 __bforget(bh);
1309 goto drop;
1310 }
1311 }
1312 } else if (jh->b_transaction) {
1313 J_ASSERT_JH(jh, (jh->b_transaction ==
1314 journal->j_committing_transaction));
1315 /* However, if the buffer is still owned by a prior
1316 * (committing) transaction, we can't drop it yet... */
1317 JBUFFER_TRACE(jh, "belongs to older transaction");
1318 /* ... but we CAN drop it from the new transaction if we
1319 * have also modified it since the original commit. */
1320
1321 if (jh->b_next_transaction) {
1322 J_ASSERT(jh->b_next_transaction == transaction);
1323 jh->b_next_transaction = NULL;
1324
1325 /*
1326 * only drop a reference if this transaction modified
1327 * the buffer
1328 */
1329 if (was_modified)
1330 drop_reserve = 1;
1331 }
1332 }
1333
1334not_jbd:
1335 spin_unlock(&journal->j_list_lock);
1336 jbd_unlock_bh_state(bh);
1337 __brelse(bh);
1338drop:
1339 if (drop_reserve) {
1340 /* no need to reserve log space for this block -bzzz */
1341 handle->h_buffer_credits++;
1342 }
1343 return err;
1344}
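
/*
 * Editorial sketch (not part of the original source): journal_forget() is the
 * journalled counterpart of bforget() and, as documented above, decrements
 * bh->b_count itself, so the caller drops its own brelse(). The wrapper name
 * example_discard_metadata() is made up for illustration.
 */
static int example_discard_metadata(handle_t *handle, struct buffer_head *bh)
{
	/* bh->b_count is dropped by journal_forget(); no brelse() here */
	return journal_forget(handle, bh);
}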
1345
1346/**
1347 * int journal_stop() - complete a transaction
1348 * @handle: transaction to complete.
1349 *
1350 * All done for a particular handle.
1351 *
1352 * There is not much action needed here. We just return any remaining
1353 * buffer credits to the transaction and remove the handle. The only
1354 * complication is that we need to start a commit operation if the
1355 * filesystem is marked for synchronous update.
1356 *
1357 * journal_stop itself will not usually return an error, but it may
1358 * do so in unusual circumstances. In particular, expect it to
1359 * return -EIO if a journal_abort has been executed since the
1360 * transaction began.
1361 */
1362int journal_stop(handle_t *handle)
1363{
1364 transaction_t *transaction = handle->h_transaction;
1365 journal_t *journal = transaction->t_journal;
1366 int err;
1367 pid_t pid;
1368
1369 J_ASSERT(journal_current_handle() == handle);
1370
1371 if (is_handle_aborted(handle))
1372 err = -EIO;
1373 else {
1374 J_ASSERT(transaction->t_updates > 0);
1375 err = 0;
1376 }
1377
1378 if (--handle->h_ref > 0) {
1379 jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
1380 handle->h_ref);
1381 return err;
1382 }
1383
1384 jbd_debug(4, "Handle %p going down\n", handle);
1385
1386 /*
1387 * Implement synchronous transaction batching. If the handle
1388 * was synchronous, don't force a commit immediately. Let's
1389 * yield and let another thread piggyback onto this transaction.
1390 * Keep doing that while new threads continue to arrive.
1391 * It doesn't cost much - we're about to run a commit and sleep
1392 * on IO anyway. Speeds up many-threaded, many-dir operations
1393 * by 30x or more...
1394 *
1395 * We try and optimize the sleep time against what the underlying disk
1396 * can do, instead of having a static sleep time. This is useful for
1397 * the case where our storage is so fast that it is more optimal to go
1398 * ahead and force a flush and wait for the transaction to be committed
1399 * than it is to wait for an arbitrary amount of time for new writers to
1400 * join the transaction. We achieve this by measuring how long it takes
1401 * to commit a transaction, and compare it with how long this
1402 * transaction has been running, and if run time < commit time then we
1403 * sleep for the delta and commit. This greatly helps super fast disks
1404 * that would see slowdowns as more threads started doing fsyncs.
1405 *
1406 * But don't do this if this process was the most recent one to
1407 * perform a synchronous write. We do this to detect the case where a
1408 * single process is doing a stream of sync writes. No point in waiting
1409 * for joiners in that case.
1410 */
1411 pid = current->pid;
1412 if (handle->h_sync && journal->j_last_sync_writer != pid) {
1413 u64 commit_time, trans_time;
1414
1415 journal->j_last_sync_writer = pid;
1416
1417 spin_lock(&journal->j_state_lock);
1418 commit_time = journal->j_average_commit_time;
1419 spin_unlock(&journal->j_state_lock);
1420
1421 trans_time = ktime_to_ns(ktime_sub(ktime_get(),
1422 transaction->t_start_time));
1423
1424 commit_time = min_t(u64, commit_time,
1425 1000*jiffies_to_usecs(1));
1426
1427 if (trans_time < commit_time) {
1428 ktime_t expires = ktime_add_ns(ktime_get(),
1429 commit_time);
1430 set_current_state(TASK_UNINTERRUPTIBLE);
1431 schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
1432 }
1433 }
1434
1435 current->journal_info = NULL;
1436 spin_lock(&journal->j_state_lock);
1437 spin_lock(&transaction->t_handle_lock);
1438 transaction->t_outstanding_credits -= handle->h_buffer_credits;
1439 transaction->t_updates--;
1440 if (!transaction->t_updates) {
1441 wake_up(&journal->j_wait_updates);
1442 if (journal->j_barrier_count)
1443 wake_up(&journal->j_wait_transaction_locked);
1444 }
1445
1446 /*
1447 * If the handle is marked SYNC, we need to set another commit
1448 * going! We also want to force a commit if the current
1449 * transaction is occupying too much of the log, or if the
1450 * transaction is too old now.
1451 */
1452 if (handle->h_sync ||
1453 transaction->t_outstanding_credits >
1454 journal->j_max_transaction_buffers ||
1455 time_after_eq(jiffies, transaction->t_expires)) {
1456 /* Do this even for aborted journals: an abort still
1457 * completes the commit thread, it just doesn't write
1458 * anything to disk. */
1459 tid_t tid = transaction->t_tid;
1460
1461 spin_unlock(&transaction->t_handle_lock);
1462 jbd_debug(2, "transaction too old, requesting commit for "
1463 "handle %p\n", handle);
1464 /* This is non-blocking */
1465 __log_start_commit(journal, transaction->t_tid);
1466 spin_unlock(&journal->j_state_lock);
1467
1468 /*
1469 * Special case: JFS_SYNC synchronous updates require us
1470 * to wait for the commit to complete.
1471 */
1472 if (handle->h_sync && !(current->flags & PF_MEMALLOC))
1473 err = log_wait_commit(journal, tid);
1474 } else {
1475 spin_unlock(&transaction->t_handle_lock);
1476 spin_unlock(&journal->j_state_lock);
1477 }
1478
1479 lock_map_release(&handle->h_lockdep_map);
1480
1481 jbd_free_handle(handle);
1482 return err;
1483}
1484
1485/**
1486 * int journal_force_commit() - force any uncommitted transactions
1487 * @journal: journal to force
1488 *
1489 * For synchronous operations: force any uncommitted transactions
1490 * to disk. May seem kludgy, but it reuses all the handle batching
1491 * code in a very simple manner.
1492 */
1493int journal_force_commit(journal_t *journal)
1494{
1495 handle_t *handle;
1496 int ret;
1497
1498 handle = journal_start(journal, 1);
1499 if (IS_ERR(handle)) {
1500 ret = PTR_ERR(handle);
1501 } else {
1502 handle->h_sync = 1;
1503 ret = journal_stop(handle);
1504 }
1505 return ret;
1506}
1507
1508/*
1509 *
1510 * List management code snippets: various functions for manipulating the
1511 * transaction buffer lists.
1512 *
1513 */
1514
1515/*
1516 * Append a buffer to a transaction list, given the transaction's list head
1517 * pointer.
1518 *
1519 * j_list_lock is held.
1520 *
1521 * jbd_lock_bh_state(jh2bh(jh)) is held.
1522 */
1523
1524static inline void
1525__blist_add_buffer(struct journal_head **list, struct journal_head *jh)
1526{
1527 if (!*list) {
1528 jh->b_tnext = jh->b_tprev = jh;
1529 *list = jh;
1530 } else {
1531 /* Insert at the tail of the list to preserve order */
1532 struct journal_head *first = *list, *last = first->b_tprev;
1533 jh->b_tprev = last;
1534 jh->b_tnext = first;
1535 last->b_tnext = first->b_tprev = jh;
1536 }
1537}
1538
1539/*
1540 * Remove a buffer from a transaction list, given the transaction's list
1541 * head pointer.
1542 *
1543 * Called with j_list_lock held, and the journal may not be locked.
1544 *
1545 * jbd_lock_bh_state(jh2bh(jh)) is held.
1546 */
1547
1548static inline void
1549__blist_del_buffer(struct journal_head **list, struct journal_head *jh)
1550{
1551 if (*list == jh) {
1552 *list = jh->b_tnext;
1553 if (*list == jh)
1554 *list = NULL;
1555 }
1556 jh->b_tprev->b_tnext = jh->b_tnext;
1557 jh->b_tnext->b_tprev = jh->b_tprev;
1558}
1559
1560/*
1561 * Remove a buffer from the appropriate transaction list.
1562 *
1563 * Note that this function can *change* the value of
1564 * bh->b_transaction->t_sync_datalist, t_buffers, t_forget,
1565 * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list. If the caller
1566 * is holding onto a copy of one of these pointers, it could go bad.
1567 * Generally the caller needs to re-read the pointer from the transaction_t.
1568 *
1569 * Called under j_list_lock. The journal may not be locked.
1570 */
1571static void __journal_temp_unlink_buffer(struct journal_head *jh)
1572{
1573 struct journal_head **list = NULL;
1574 transaction_t *transaction;
1575 struct buffer_head *bh = jh2bh(jh);
1576
1577 J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
1578 transaction = jh->b_transaction;
1579 if (transaction)
1580 assert_spin_locked(&transaction->t_journal->j_list_lock);
1581
1582 J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
1583 if (jh->b_jlist != BJ_None)
1584 J_ASSERT_JH(jh, transaction != NULL);
1585
1586 switch (jh->b_jlist) {
1587 case BJ_None:
1588 return;
1589 case BJ_SyncData:
1590 list = &transaction->t_sync_datalist;
1591 break;
1592 case BJ_Metadata:
1593 transaction->t_nr_buffers--;
1594 J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
1595 list = &transaction->t_buffers;
1596 break;
1597 case BJ_Forget:
1598 list = &transaction->t_forget;
1599 break;
1600 case BJ_IO:
1601 list = &transaction->t_iobuf_list;
1602 break;
1603 case BJ_Shadow:
1604 list = &transaction->t_shadow_list;
1605 break;
1606 case BJ_LogCtl:
1607 list = &transaction->t_log_list;
1608 break;
1609 case BJ_Reserved:
1610 list = &transaction->t_reserved_list;
1611 break;
1612 case BJ_Locked:
1613 list = &transaction->t_locked_list;
1614 break;
1615 }
1616
1617 __blist_del_buffer(list, jh);
1618 jh->b_jlist = BJ_None;
1619 if (test_clear_buffer_jbddirty(bh))
1620 mark_buffer_dirty(bh); /* Expose it to the VM */
1621}
1622
1623/*
1624 * Remove buffer from all transactions.
1625 *
1626 * Called with bh_state lock and j_list_lock
1627 *
1628 * jh and bh may be already freed when this function returns.
1629 */
1630void __journal_unfile_buffer(struct journal_head *jh)
1631{
1632 __journal_temp_unlink_buffer(jh);
1633 jh->b_transaction = NULL;
1634 journal_put_journal_head(jh);
1635}
1636
1637void journal_unfile_buffer(journal_t *journal, struct journal_head *jh)
1638{
1639 struct buffer_head *bh = jh2bh(jh);
1640
1641 /* Get reference so that buffer cannot be freed before we unlock it */
1642 get_bh(bh);
1643 jbd_lock_bh_state(bh);
1644 spin_lock(&journal->j_list_lock);
1645 __journal_unfile_buffer(jh);
1646 spin_unlock(&journal->j_list_lock);
1647 jbd_unlock_bh_state(bh);
1648 __brelse(bh);
1649}
1650
1651/*
1652 * Called from journal_try_to_free_buffers().
1653 *
1654 * Called under jbd_lock_bh_state(bh)
1655 */
1656static void
1657__journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
1658{
1659 struct journal_head *jh;
1660
1661 jh = bh2jh(bh);
1662
1663 if (buffer_locked(bh) || buffer_dirty(bh))
1664 goto out;
1665
1666 if (jh->b_next_transaction != NULL)
1667 goto out;
1668
1669 spin_lock(&journal->j_list_lock);
1670 if (jh->b_transaction != NULL && jh->b_cp_transaction == NULL) {
1671 if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) {
1672 /* A written-back ordered data buffer */
1673 JBUFFER_TRACE(jh, "release data");
1674 __journal_unfile_buffer(jh);
1675 }
1676 } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
1677 /* written-back checkpointed metadata buffer */
1678 if (jh->b_jlist == BJ_None) {
1679 JBUFFER_TRACE(jh, "remove from checkpoint list");
1680 __journal_remove_checkpoint(jh);
1681 }
1682 }
1683 spin_unlock(&journal->j_list_lock);
1684out:
1685 return;
1686}
1687
1688/**
1689 * int journal_try_to_free_buffers() - try to free page buffers.
1690 * @journal: journal for operation
1691 * @page: to try and free
1692 * @gfp_mask: we use the mask to detect how hard we should try to release
1693 * buffers. If __GFP_WAIT and __GFP_FS are set, we wait for commit code to
1694 * release the buffers.
1695 *
1696 *
1697 * For all the buffers on this page,
1698 * if they are fully written out ordered data, move them onto BUF_CLEAN
1699 * so try_to_free_buffers() can reap them.
1700 *
1701 * This function returns non-zero if we wish try_to_free_buffers()
1702 * to be called. We do this if the page is releasable by try_to_free_buffers().
1703 * We also do it if the page has locked or dirty buffers and the caller wants
1704 * us to perform sync or async writeout.
1705 *
1706 * This complicates JBD locking somewhat. We aren't protected by the
1707 * BKL here. We wish to remove the buffer from its committing or
1708 * running transaction's ->t_datalist via __journal_unfile_buffer.
1709 *
1710 * This may *change* the value of transaction_t->t_datalist, so anyone
1711 * who looks at t_datalist needs to lock against this function.
1712 *
1713 * Even worse, someone may be doing a journal_dirty_data on this
1714 * buffer. So we need to lock against that. journal_dirty_data()
1715 * will come out of the lock with the buffer dirty, which makes it
1716 * ineligible for release here.
1717 *
1718 * Who else is affected by this? hmm... Really the only contender
1719 * is do_get_write_access() - it could be looking at the buffer while
1720 * journal_try_to_free_buffer() is changing its state. But that
1721 * cannot happen because we never reallocate freed data as metadata
1722 * while the data is part of a transaction. Yes?
1723 *
1724 * Return 0 on failure, 1 on success
1725 */
1726int journal_try_to_free_buffers(journal_t *journal,
1727 struct page *page, gfp_t gfp_mask)
1728{
1729 struct buffer_head *head;
1730 struct buffer_head *bh;
1731 int ret = 0;
1732
1733 J_ASSERT(PageLocked(page));
1734
1735 head = page_buffers(page);
1736 bh = head;
1737 do {
1738 struct journal_head *jh;
1739
1740 /*
1741 * We take our own ref against the journal_head here to avoid
1742 * having to add tons of locking around each instance of
1743 * journal_put_journal_head().
1744 */
1745 jh = journal_grab_journal_head(bh);
1746 if (!jh)
1747 continue;
1748
1749 jbd_lock_bh_state(bh);
1750 __journal_try_to_free_buffer(journal, bh);
1751 journal_put_journal_head(jh);
1752 jbd_unlock_bh_state(bh);
1753 if (buffer_jbd(bh))
1754 goto busy;
1755 } while ((bh = bh->b_this_page) != head);
1756
1757 ret = try_to_free_buffers(page);
1758
1759busy:
1760 return ret;
1761}
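
/*
 * Editorial sketch (not part of the original source): this is the routine a
 * filesystem's ->releasepage address_space operation typically forwards to.
 * A real hook has a (page, gfp_mask) signature and finds the journal via
 * page->mapping->host; here the journal is passed in explicitly to keep the
 * sketch self-contained. The page is assumed to be locked by the caller.
 */
static int example_releasepage(journal_t *journal, struct page *page,
			       gfp_t gfp_mask)
{
	if (!page_has_buffers(page))
		return 0;			/* nothing for JBD to do */
	return journal_try_to_free_buffers(journal, page, gfp_mask);
}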
1762
1763/*
1764 * This buffer is no longer needed. If it is on an older transaction's
1765 * checkpoint list we need to record it on this transaction's forget list
1766 * to pin this buffer (and hence its checkpointing transaction) down until
1767 * this transaction commits. If the buffer isn't on a checkpoint list, we
1768 * release it.
1769 * Returns non-zero if JBD no longer has an interest in the buffer.
1770 *
1771 * Called under j_list_lock.
1772 *
1773 * Called under jbd_lock_bh_state(bh).
1774 */
1775static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
1776{
1777 int may_free = 1;
1778 struct buffer_head *bh = jh2bh(jh);
1779
1780 if (jh->b_cp_transaction) {
1781 JBUFFER_TRACE(jh, "on running+cp transaction");
1782 __journal_temp_unlink_buffer(jh);
1783 /*
1784 * We don't want to write the buffer anymore, clear the
1785 * bit so that we don't confuse checks in
1786 * __journal_file_buffer
1787 */
1788 clear_buffer_dirty(bh);
1789 __journal_file_buffer(jh, transaction, BJ_Forget);
1790 may_free = 0;
1791 } else {
1792 JBUFFER_TRACE(jh, "on running transaction");
1793 __journal_unfile_buffer(jh);
1794 }
1795 return may_free;
1796}
1797
1798/*
1799 * journal_invalidatepage
1800 *
1801 * This code is tricky. It has a number of cases to deal with.
1802 *
1803 * There are two invariants which this code relies on:
1804 *
1805 * i_size must be updated on disk before we start calling invalidatepage on the
1806 * data.
1807 *
1808 * This is done in ext3 by defining an ext3_setattr method which
1809 * updates i_size before truncate gets going. By maintaining this
1810 * invariant, we can be sure that it is safe to throw away any buffers
1811 * attached to the current transaction: once the transaction commits,
1812 * we know that the data will not be needed.
1813 *
1814 * Note however that we can *not* throw away data belonging to the
1815 * previous, committing transaction!
1816 *
1817 * Any disk blocks which *are* part of the previous, committing
1818 * transaction (and which therefore cannot be discarded immediately) are
1819 * not going to be reused in the new running transaction
1820 *
1821 * The bitmap committed_data images guarantee this: any block which is
1822 * allocated in one transaction and removed in the next will be marked
1823 * as in-use in the committed_data bitmap, so cannot be reused until
1824 * the next transaction to delete the block commits. This means that
1825 * leaving committing buffers dirty is quite safe: the disk blocks
1826 * cannot be reallocated to a different file and so buffer aliasing is
1827 * not possible.
1828 *
1829 *
1830 * The above applies mainly to ordered data mode. In writeback mode we
1831 * don't make guarantees about the order in which data hits disk --- in
1832 * particular we don't guarantee that new dirty data is flushed before
1833 * transaction commit --- so it is always safe just to discard data
1834 * immediately in that mode. --sct
1835 */
1836
1837/*
1838 * The journal_unmap_buffer helper function returns zero if the buffer
1839 * concerned remains pinned as an anonymous buffer belonging to an older
1840 * transaction.
1841 *
1842 * We're outside-transaction here. Either or both of j_running_transaction
1843 * and j_committing_transaction may be NULL.
1844 */
1845static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,
1846 int partial_page)
1847{
1848 transaction_t *transaction;
1849 struct journal_head *jh;
1850 int may_free = 1;
1851
1852 BUFFER_TRACE(bh, "entry");
1853
1854retry:
1855 /*
1856 * It is safe to proceed here without the j_list_lock because the
1857 * buffers cannot be stolen by try_to_free_buffers as long as we are
1858 * holding the page lock. --sct
1859 */
1860
1861 if (!buffer_jbd(bh))
1862 goto zap_buffer_unlocked;
1863
1864 spin_lock(&journal->j_state_lock);
1865 jbd_lock_bh_state(bh);
1866 spin_lock(&journal->j_list_lock);
1867
1868 jh = journal_grab_journal_head(bh);
1869 if (!jh)
1870 goto zap_buffer_no_jh;
1871
1872 /*
1873 * We cannot remove the buffer from checkpoint lists until the
1874 * transaction adding inode to orphan list (let's call it T)
1875 * is committed. Otherwise if the transaction changing the
1876 * buffer would be cleaned from the journal before T is
1877 * committed, a crash would cause the correct contents of
1878 * the buffer to be lost. On the other hand we have to
1879 * clear the buffer dirty bit at latest at the moment when the
1880 * transaction marking the buffer as freed in the filesystem
1881 * structures is committed because from that moment on the
1882 * block can be reallocated and used by a different page.
1883 * Since the block hasn't been freed yet but the inode has
1884 * already been added to the orphan list, it is safe for us to add
1885 * the buffer to the BJ_Forget list of the newest transaction.
1886 *
1887 * Also we have to clear buffer_mapped flag of a truncated buffer
1888 * because the buffer_head may be attached to the page straddling
1889 * i_size (can happen only when blocksize < pagesize) and thus the
1890 * buffer_head can be reused when the file is extended again. So we end
1891 * up keeping around invalidated buffers attached to transactions'
1892 * BJ_Forget list just to stop checkpointing code from cleaning up
1893 * the transaction this buffer was modified in.
1894 */
1895 transaction = jh->b_transaction;
1896 if (transaction == NULL) {
1897 /* First case: not on any transaction. If it
1898 * has no checkpoint link, then we can zap it:
1899 * it's a writeback-mode buffer so we don't care
1900 * if it hits disk safely. */
1901 if (!jh->b_cp_transaction) {
1902 JBUFFER_TRACE(jh, "not on any transaction: zap");
1903 goto zap_buffer;
1904 }
1905
1906 if (!buffer_dirty(bh)) {
1907 /* bdflush has written it. We can drop it now */
1908 goto zap_buffer;
1909 }
1910
1911 /* OK, it must be in the journal but still not
1912 * written fully to disk: it's metadata or
1913 * journaled data... */
1914
1915 if (journal->j_running_transaction) {
1916 /* ... and once the current transaction has
1917 * committed, the buffer won't be needed any
1918 * longer. */
1919 JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget");
1920 may_free = __dispose_buffer(jh,
1921 journal->j_running_transaction);
1922 goto zap_buffer;
1923 } else {
1924 /* There is no currently-running transaction. So the
1925 * orphan record which we wrote for this file must have
1926 * passed into commit. We must attach this buffer to
1927 * the committing transaction, if it exists. */
1928 if (journal->j_committing_transaction) {
1929 JBUFFER_TRACE(jh, "give to committing trans");
1930 may_free = __dispose_buffer(jh,
1931 journal->j_committing_transaction);
1932 goto zap_buffer;
1933 } else {
1934 /* The orphan record's transaction has
1935 * committed. We can cleanse this buffer. */
1936 clear_buffer_jbddirty(bh);
1937 goto zap_buffer;
1938 }
1939 }
1940 } else if (transaction == journal->j_committing_transaction) {
1941 JBUFFER_TRACE(jh, "on committing transaction");
1942 if (jh->b_jlist == BJ_Locked) {
1943 /*
1944 * The buffer is on the committing transaction's locked
1945 * list. We have the buffer locked, so I/O has
1946 * completed. So we can nail the buffer now.
1947 */
1948 may_free = __dispose_buffer(jh, transaction);
1949 goto zap_buffer;
1950 }
1951 /*
1952 * The buffer is committing, we simply cannot touch
1953 * it. If the page is straddling i_size we have to wait
1954 * for commit and try again.
1955 */
1956 if (partial_page) {
1957 tid_t tid = journal->j_committing_transaction->t_tid;
1958
1959 journal_put_journal_head(jh);
1960 spin_unlock(&journal->j_list_lock);
1961 jbd_unlock_bh_state(bh);
1962 spin_unlock(&journal->j_state_lock);
1963 unlock_buffer(bh);
1964 log_wait_commit(journal, tid);
1965 lock_buffer(bh);
1966 goto retry;
1967 }
1968 /*
1969 * OK, buffer won't be reachable after truncate. We just set
1970 * j_next_transaction to the running transaction (if there is
1971 * one) and mark buffer as freed so that commit code knows it
1972 * should clear dirty bits when it is done with the buffer.
1973 */
1974 set_buffer_freed(bh);
1975 if (journal->j_running_transaction && buffer_jbddirty(bh))
1976 jh->b_next_transaction = journal->j_running_transaction;
1977 journal_put_journal_head(jh);
1978 spin_unlock(&journal->j_list_lock);
1979 jbd_unlock_bh_state(bh);
1980 spin_unlock(&journal->j_state_lock);
1981 return 0;
1982 } else {
1983 /* Good, the buffer belongs to the running transaction.
1984 * We are writing our own transaction's data, not any
1985 * previous one's, so it is safe to throw it away
1986 * (remember that we expect the filesystem to have set
1987 * i_size already for this truncate so recovery will not
1988 * expose the disk blocks we are discarding here.) */
1989 J_ASSERT_JH(jh, transaction == journal->j_running_transaction);
1990 JBUFFER_TRACE(jh, "on running transaction");
1991 may_free = __dispose_buffer(jh, transaction);
1992 }
1993
1994zap_buffer:
1995 /*
1996 * This is tricky. Although the buffer is truncated, it may be reused
1997 * if blocksize < pagesize and it is attached to the page straddling
1998 * EOF. Since the buffer might have been added to the BJ_Forget list of the
1999 * running transaction, journal_get_write_access() won't clear
2000 * b_modified and credit accounting gets confused. So clear b_modified
2001 * here. */
2002 jh->b_modified = 0;
2003 journal_put_journal_head(jh);
2004zap_buffer_no_jh:
2005 spin_unlock(&journal->j_list_lock);
2006 jbd_unlock_bh_state(bh);
2007 spin_unlock(&journal->j_state_lock);
2008zap_buffer_unlocked:
2009 clear_buffer_dirty(bh);
2010 J_ASSERT_BH(bh, !buffer_jbddirty(bh));
2011 clear_buffer_mapped(bh);
2012 clear_buffer_req(bh);
2013 clear_buffer_new(bh);
2014 bh->b_bdev = NULL;
2015 return may_free;
2016}
2017
2018/**
2019 * void journal_invalidatepage() - invalidate a journal page
2020 * @journal: journal to use for flush
2021 * @page: page to flush
2022 * @offset: offset of the range to invalidate
2023 * @length: length of the range to invalidate
2024 *
2025 * Reap page buffers containing data in the specified range of the page.
2026 */
2027void journal_invalidatepage(journal_t *journal,
2028 struct page *page,
2029 unsigned int offset,
2030 unsigned int length)
2031{
2032 struct buffer_head *head, *bh, *next;
2033 unsigned int stop = offset + length;
2034 unsigned int curr_off = 0;
2035 int partial_page = (offset || length < PAGE_CACHE_SIZE);
2036 int may_free = 1;
2037
2038 if (!PageLocked(page))
2039 BUG();
2040 if (!page_has_buffers(page))
2041 return;
2042
2043 BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
2044
2045 /* We will potentially be playing with lists other than just the
2046 * data lists (especially for journaled data mode), so be
2047 * cautious in our locking. */
2048
2049 head = bh = page_buffers(page);
2050 do {
2051 unsigned int next_off = curr_off + bh->b_size;
2052 next = bh->b_this_page;
2053
2054 if (next_off > stop)
2055 return;
2056
2057 if (offset <= curr_off) {
2058 /* This block is wholly outside the truncation point */
2059 lock_buffer(bh);
2060 may_free &= journal_unmap_buffer(journal, bh,
2061 partial_page);
2062 unlock_buffer(bh);
2063 }
2064 curr_off = next_off;
2065 bh = next;
2066
2067 } while (bh != head);
2068
2069 if (!partial_page) {
2070 if (may_free && try_to_free_buffers(page))
2071 J_ASSERT(!page_has_buffers(page));
2072 }
2073}
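
[Editor's note: as a hedged illustration of how a filesystem wires the function above into its address_space operations, the sketch below follows the pattern ext3 uses. The method name example_invalidatepage() is hypothetical, and the EXT3_JOURNAL() accessor is assumed from ext3's conventions; neither is part of this patch.]

static void example_invalidatepage(struct page *page, unsigned int offset,
				   unsigned int length)
{
	/* Assumption: EXT3_JOURNAL() maps the page's inode to its journal. */
	journal_t *journal = EXT3_JOURNAL(page->mapping->host);

	/*
	 * Let the journal detach any journaled buffers in the range.  Per the
	 * invariants documented above, i_size must already be updated on disk
	 * before this is called for a truncate.
	 */
	journal_invalidatepage(journal, page, offset, length);
}
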
2074
2075/*
2076 * File a buffer on the given transaction list.
2077 */
2078void __journal_file_buffer(struct journal_head *jh,
2079 transaction_t *transaction, int jlist)
2080{
2081 struct journal_head **list = NULL;
2082 int was_dirty = 0;
2083 struct buffer_head *bh = jh2bh(jh);
2084
2085 J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
2086 assert_spin_locked(&transaction->t_journal->j_list_lock);
2087
2088 J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
2089 J_ASSERT_JH(jh, jh->b_transaction == transaction ||
2090 jh->b_transaction == NULL);
2091
2092 if (jh->b_transaction && jh->b_jlist == jlist)
2093 return;
2094
2095 if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
2096 jlist == BJ_Shadow || jlist == BJ_Forget) {
2097 /*
2098 * For metadata buffers, we track dirty bit in buffer_jbddirty
2099 * instead of buffer_dirty. We should not see a dirty bit set
2100 * here because we clear it in do_get_write_access but e.g.
2101 * tune2fs can modify the sb and set the dirty bit at any time
2102 * so we try to gracefully handle that.
2103 */
2104 if (buffer_dirty(bh))
2105 warn_dirty_buffer(bh);
2106 if (test_clear_buffer_dirty(bh) ||
2107 test_clear_buffer_jbddirty(bh))
2108 was_dirty = 1;
2109 }
2110
2111 if (jh->b_transaction)
2112 __journal_temp_unlink_buffer(jh);
2113 else
2114 journal_grab_journal_head(bh);
2115 jh->b_transaction = transaction;
2116
2117 switch (jlist) {
2118 case BJ_None:
2119 J_ASSERT_JH(jh, !jh->b_committed_data);
2120 J_ASSERT_JH(jh, !jh->b_frozen_data);
2121 return;
2122 case BJ_SyncData:
2123 list = &transaction->t_sync_datalist;
2124 break;
2125 case BJ_Metadata:
2126 transaction->t_nr_buffers++;
2127 list = &transaction->t_buffers;
2128 break;
2129 case BJ_Forget:
2130 list = &transaction->t_forget;
2131 break;
2132 case BJ_IO:
2133 list = &transaction->t_iobuf_list;
2134 break;
2135 case BJ_Shadow:
2136 list = &transaction->t_shadow_list;
2137 break;
2138 case BJ_LogCtl:
2139 list = &transaction->t_log_list;
2140 break;
2141 case BJ_Reserved:
2142 list = &transaction->t_reserved_list;
2143 break;
2144 case BJ_Locked:
2145 list = &transaction->t_locked_list;
2146 break;
2147 }
2148
2149 __blist_add_buffer(list, jh);
2150 jh->b_jlist = jlist;
2151
2152 if (was_dirty)
2153 set_buffer_jbddirty(bh);
2154}
2155
2156void journal_file_buffer(struct journal_head *jh,
2157 transaction_t *transaction, int jlist)
2158{
2159 jbd_lock_bh_state(jh2bh(jh));
2160 spin_lock(&transaction->t_journal->j_list_lock);
2161 __journal_file_buffer(jh, transaction, jlist);
2162 spin_unlock(&transaction->t_journal->j_list_lock);
2163 jbd_unlock_bh_state(jh2bh(jh));
2164}
2165
2166/*
2167 * Remove a buffer from its current buffer list in preparation for
2168 * dropping it from its current transaction entirely. If the buffer has
2169 * already started to be used by a subsequent transaction, refile the
2170 * buffer on that transaction's metadata list.
2171 *
2172 * Called under j_list_lock
2173 * Called under jbd_lock_bh_state(jh2bh(jh))
2174 *
2175 * jh and bh may already be free when this function returns
2176 */
2177void __journal_refile_buffer(struct journal_head *jh)
2178{
2179 int was_dirty, jlist;
2180 struct buffer_head *bh = jh2bh(jh);
2181
2182 J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
2183 if (jh->b_transaction)
2184 assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock);
2185
2186 /* If the buffer is now unused, just drop it. */
2187 if (jh->b_next_transaction == NULL) {
2188 __journal_unfile_buffer(jh);
2189 return;
2190 }
2191
2192 /*
2193 * It has been modified by a later transaction: add it to the new
2194 * transaction's metadata list.
2195 */
2196
2197 was_dirty = test_clear_buffer_jbddirty(bh);
2198 __journal_temp_unlink_buffer(jh);
2199 /*
2200 * We set b_transaction here because b_next_transaction will inherit
2201 * our jh reference and thus __journal_file_buffer() must not take a
2202 * new one.
2203 */
2204 jh->b_transaction = jh->b_next_transaction;
2205 jh->b_next_transaction = NULL;
2206 if (buffer_freed(bh))
2207 jlist = BJ_Forget;
2208 else if (jh->b_modified)
2209 jlist = BJ_Metadata;
2210 else
2211 jlist = BJ_Reserved;
2212 __journal_file_buffer(jh, jh->b_transaction, jlist);
2213 J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
2214
2215 if (was_dirty)
2216 set_buffer_jbddirty(bh);
2217}
2218
2219/*
2220 * __journal_refile_buffer() with necessary locking added. We take our bh
2221 * reference so that we can safely unlock bh.
2222 *
2223 * The jh and bh may be freed by this call.
2224 */
2225void journal_refile_buffer(journal_t *journal, struct journal_head *jh)
2226{
2227 struct buffer_head *bh = jh2bh(jh);
2228
2229 /* Get reference so that buffer cannot be freed before we unlock it */
2230 get_bh(bh);
2231 jbd_lock_bh_state(bh);
2232 spin_lock(&journal->j_list_lock);
2233 __journal_refile_buffer(jh);
2234 jbd_unlock_bh_state(bh);
2235 spin_unlock(&journal->j_list_lock);
2236 __brelse(bh);
2237}
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index b9dc23cd04f2..0e026a7bdcd4 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -107,8 +107,11 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
107 if (rc) 107 if (rc)
108 return rc; 108 return rc;
109 109
110 if (is_quota_modification(inode, iattr)) 110 if (is_quota_modification(inode, iattr)) {
111 dquot_initialize(inode); 111 rc = dquot_initialize(inode);
112 if (rc)
113 return rc;
114 }
112 if ((iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) || 115 if ((iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) ||
113 (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid))) { 116 (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid))) {
114 rc = dquot_transfer(inode, iattr); 117 rc = dquot_transfer(inode, iattr);
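
[Editor's note: the hunk above is one instance of the calling convention this series converts every filesystem to: dquot_initialize() now returns an errno instead of void, so quota-setup failures propagate to the caller. A minimal sketch of the pattern, with a hypothetical example_setattr() standing in for the per-filesystem callers:]

static int example_setattr(struct dentry *dentry, struct iattr *iattr)
{
	struct inode *inode = d_inode(dentry);
	int rc;

	if (is_quota_modification(inode, iattr)) {
		rc = dquot_initialize(inode);	/* may now fail, e.g. on I/O or allocation errors */
		if (rc)
			return rc;
	}
	/* ... continue with the ownership/size change as before ... */
	return 0;
}
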
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
index 6b0f816201a2..cf7936fe2e68 100644
--- a/fs/jfs/jfs_inode.c
+++ b/fs/jfs/jfs_inode.c
@@ -109,7 +109,9 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
109 /* 109 /*
110 * Allocate inode to quota. 110 * Allocate inode to quota.
111 */ 111 */
112 dquot_initialize(inode); 112 rc = dquot_initialize(inode);
113 if (rc)
114 goto fail_drop;
113 rc = dquot_alloc_inode(inode); 115 rc = dquot_alloc_inode(inode);
114 if (rc) 116 if (rc)
115 goto fail_drop; 117 goto fail_drop;
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index a5ac97b9a933..35976bdccafc 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -86,7 +86,9 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, umode_t mode,
86 86
87 jfs_info("jfs_create: dip:0x%p name:%pd", dip, dentry); 87 jfs_info("jfs_create: dip:0x%p name:%pd", dip, dentry);
88 88
89 dquot_initialize(dip); 89 rc = dquot_initialize(dip);
90 if (rc)
91 goto out1;
90 92
91 /* 93 /*
92 * search parent directory for entry/freespace 94 * search parent directory for entry/freespace
@@ -218,7 +220,9 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode)
218 220
219 jfs_info("jfs_mkdir: dip:0x%p name:%pd", dip, dentry); 221 jfs_info("jfs_mkdir: dip:0x%p name:%pd", dip, dentry);
220 222
221 dquot_initialize(dip); 223 rc = dquot_initialize(dip);
224 if (rc)
225 goto out1;
222 226
223 /* 227 /*
224 * search parent directory for entry/freespace 228 * search parent directory for entry/freespace
@@ -355,8 +359,12 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry)
355 jfs_info("jfs_rmdir: dip:0x%p name:%pd", dip, dentry); 359 jfs_info("jfs_rmdir: dip:0x%p name:%pd", dip, dentry);
356 360
357 /* Init inode for quota operations. */ 361 /* Init inode for quota operations. */
358 dquot_initialize(dip); 362 rc = dquot_initialize(dip);
359 dquot_initialize(ip); 363 if (rc)
364 goto out;
365 rc = dquot_initialize(ip);
366 if (rc)
367 goto out;
360 368
361 /* directory must be empty to be removed */ 369 /* directory must be empty to be removed */
362 if (!dtEmpty(ip)) { 370 if (!dtEmpty(ip)) {
@@ -483,8 +491,12 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry)
483 jfs_info("jfs_unlink: dip:0x%p name:%pd", dip, dentry); 491 jfs_info("jfs_unlink: dip:0x%p name:%pd", dip, dentry);
484 492
485 /* Init inode for quota operations. */ 493 /* Init inode for quota operations. */
486 dquot_initialize(dip); 494 rc = dquot_initialize(dip);
487 dquot_initialize(ip); 495 if (rc)
496 goto out;
497 rc = dquot_initialize(ip);
498 if (rc)
499 goto out;
488 500
489 if ((rc = get_UCSname(&dname, dentry))) 501 if ((rc = get_UCSname(&dname, dentry)))
490 goto out; 502 goto out;
@@ -799,7 +811,9 @@ static int jfs_link(struct dentry *old_dentry,
799 811
800 jfs_info("jfs_link: %pd %pd", old_dentry, dentry); 812 jfs_info("jfs_link: %pd %pd", old_dentry, dentry);
801 813
802 dquot_initialize(dir); 814 rc = dquot_initialize(dir);
815 if (rc)
816 goto out;
803 817
804 tid = txBegin(ip->i_sb, 0); 818 tid = txBegin(ip->i_sb, 0);
805 819
@@ -810,7 +824,7 @@ static int jfs_link(struct dentry *old_dentry,
810 * scan parent directory for entry/freespace 824 * scan parent directory for entry/freespace
811 */ 825 */
812 if ((rc = get_UCSname(&dname, dentry))) 826 if ((rc = get_UCSname(&dname, dentry)))
813 goto out; 827 goto out_tx;
814 828
815 if ((rc = dtSearch(dir, &dname, &ino, &btstack, JFS_CREATE))) 829 if ((rc = dtSearch(dir, &dname, &ino, &btstack, JFS_CREATE)))
816 goto free_dname; 830 goto free_dname;
@@ -842,12 +856,13 @@ static int jfs_link(struct dentry *old_dentry,
842 free_dname: 856 free_dname:
843 free_UCSname(&dname); 857 free_UCSname(&dname);
844 858
845 out: 859 out_tx:
846 txEnd(tid); 860 txEnd(tid);
847 861
848 mutex_unlock(&JFS_IP(ip)->commit_mutex); 862 mutex_unlock(&JFS_IP(ip)->commit_mutex);
849 mutex_unlock(&JFS_IP(dir)->commit_mutex); 863 mutex_unlock(&JFS_IP(dir)->commit_mutex);
850 864
865 out:
851 jfs_info("jfs_link: rc:%d", rc); 866 jfs_info("jfs_link: rc:%d", rc);
852 return rc; 867 return rc;
853} 868}
@@ -891,7 +906,9 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
891 906
892 jfs_info("jfs_symlink: dip:0x%p name:%s", dip, name); 907 jfs_info("jfs_symlink: dip:0x%p name:%s", dip, name);
893 908
894 dquot_initialize(dip); 909 rc = dquot_initialize(dip);
910 if (rc)
911 goto out1;
895 912
896 ssize = strlen(name) + 1; 913 ssize = strlen(name) + 1;
897 914
@@ -1082,8 +1099,12 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1082 1099
1083 jfs_info("jfs_rename: %pd %pd", old_dentry, new_dentry); 1100 jfs_info("jfs_rename: %pd %pd", old_dentry, new_dentry);
1084 1101
1085 dquot_initialize(old_dir); 1102 rc = dquot_initialize(old_dir);
1086 dquot_initialize(new_dir); 1103 if (rc)
1104 goto out1;
1105 rc = dquot_initialize(new_dir);
1106 if (rc)
1107 goto out1;
1087 1108
1088 old_ip = d_inode(old_dentry); 1109 old_ip = d_inode(old_dentry);
1089 new_ip = d_inode(new_dentry); 1110 new_ip = d_inode(new_dentry);
@@ -1130,7 +1151,9 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1130 } else if (new_ip) { 1151 } else if (new_ip) {
1131 IWRITE_LOCK(new_ip, RDWRLOCK_NORMAL); 1152 IWRITE_LOCK(new_ip, RDWRLOCK_NORMAL);
1132 /* Init inode for quota operations. */ 1153 /* Init inode for quota operations. */
1133 dquot_initialize(new_ip); 1154 rc = dquot_initialize(new_ip);
1155 if (rc)
1156 goto out_unlock;
1134 } 1157 }
1135 1158
1136 /* 1159 /*
@@ -1318,6 +1341,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1318 1341
1319 clear_cflag(COMMIT_Stale, old_dir); 1342 clear_cflag(COMMIT_Stale, old_dir);
1320 } 1343 }
1344 out_unlock:
1321 if (new_ip && !S_ISDIR(new_ip->i_mode)) 1345 if (new_ip && !S_ISDIR(new_ip->i_mode))
1322 IWRITE_UNLOCK(new_ip); 1346 IWRITE_UNLOCK(new_ip);
1323 out3: 1347 out3:
@@ -1353,7 +1377,9 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry,
1353 1377
1354 jfs_info("jfs_mknod: %pd", dentry); 1378 jfs_info("jfs_mknod: %pd", dentry);
1355 1379
1356 dquot_initialize(dir); 1380 rc = dquot_initialize(dir);
1381 if (rc)
1382 goto out;
1357 1383
1358 if ((rc = get_UCSname(&dname, dentry))) 1384 if ((rc = get_UCSname(&dname, dentry)))
1359 goto out; 1385 goto out;
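
[Editor's note: one detail worth calling out in the jfs_link() hunks above: dquot_initialize() can now fail before txBegin() has run, so the old "out" label (which ends the transaction) is renamed "out_tx" and a new "out" label is added after the unlocks. The early failure therefore skips the transaction teardown entirely. Schematically, with labels and calls taken from the diff:]

	rc = dquot_initialize(dir);
	if (rc)
		goto out;		/* nothing has been started yet */

	tid = txBegin(ip->i_sb, 0);
	/* ... take the commit mutexes, modify the directory ... */

out_tx:
	txEnd(tid);
	mutex_unlock(&JFS_IP(ip)->commit_mutex);
	mutex_unlock(&JFS_IP(dir)->commit_mutex);
out:
	jfs_info("jfs_link: rc:%d", rc);
	return rc;
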
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 719f7f4c7a37..7210583b472f 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -105,8 +105,11 @@ static int ocfs2_file_open(struct inode *inode, struct file *file)
105 file->f_path.dentry->d_name.len, 105 file->f_path.dentry->d_name.len,
106 file->f_path.dentry->d_name.name, mode); 106 file->f_path.dentry->d_name.name, mode);
107 107
108 if (file->f_mode & FMODE_WRITE) 108 if (file->f_mode & FMODE_WRITE) {
109 dquot_initialize(inode); 109 status = dquot_initialize(inode);
110 if (status)
111 goto leave;
112 }
110 113
111 spin_lock(&oi->ip_lock); 114 spin_lock(&oi->ip_lock);
112 115
@@ -1155,8 +1158,11 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1155 if (status) 1158 if (status)
1156 return status; 1159 return status;
1157 1160
1158 if (is_quota_modification(inode, attr)) 1161 if (is_quota_modification(inode, attr)) {
1159 dquot_initialize(inode); 1162 status = dquot_initialize(inode);
1163 if (status)
1164 return status;
1165 }
1160 size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; 1166 size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
1161 if (size_change) { 1167 if (size_change) {
1162 status = ocfs2_rw_lock(inode, 1); 1168 status = ocfs2_rw_lock(inode, 1);
@@ -1209,8 +1215,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1209 && OCFS2_HAS_RO_COMPAT_FEATURE(sb, 1215 && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1210 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { 1216 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
1211 transfer_to[USRQUOTA] = dqget(sb, make_kqid_uid(attr->ia_uid)); 1217 transfer_to[USRQUOTA] = dqget(sb, make_kqid_uid(attr->ia_uid));
1212 if (!transfer_to[USRQUOTA]) { 1218 if (IS_ERR(transfer_to[USRQUOTA])) {
1213 status = -ESRCH; 1219 status = PTR_ERR(transfer_to[USRQUOTA]);
1214 goto bail_unlock; 1220 goto bail_unlock;
1215 } 1221 }
1216 } 1222 }
@@ -1218,8 +1224,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1218 && OCFS2_HAS_RO_COMPAT_FEATURE(sb, 1224 && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1219 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { 1225 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
1220 transfer_to[GRPQUOTA] = dqget(sb, make_kqid_gid(attr->ia_gid)); 1226 transfer_to[GRPQUOTA] = dqget(sb, make_kqid_gid(attr->ia_gid));
1221 if (!transfer_to[GRPQUOTA]) { 1227 if (IS_ERR(transfer_to[GRPQUOTA])) {
1222 status = -ESRCH; 1228 status = PTR_ERR(transfer_to[GRPQUOTA]);
1223 goto bail_unlock; 1229 goto bail_unlock;
1224 } 1230 }
1225 } 1231 }
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 6e6abb93fda5..948681e37cfd 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -200,11 +200,12 @@ bail:
200static struct inode *ocfs2_get_init_inode(struct inode *dir, umode_t mode) 200static struct inode *ocfs2_get_init_inode(struct inode *dir, umode_t mode)
201{ 201{
202 struct inode *inode; 202 struct inode *inode;
203 int status;
203 204
204 inode = new_inode(dir->i_sb); 205 inode = new_inode(dir->i_sb);
205 if (!inode) { 206 if (!inode) {
206 mlog(ML_ERROR, "new_inode failed!\n"); 207 mlog(ML_ERROR, "new_inode failed!\n");
207 return NULL; 208 return ERR_PTR(-ENOMEM);
208 } 209 }
209 210
210 /* populate as many fields early on as possible - many of 211 /* populate as many fields early on as possible - many of
@@ -213,7 +214,10 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, umode_t mode)
213 if (S_ISDIR(mode)) 214 if (S_ISDIR(mode))
214 set_nlink(inode, 2); 215 set_nlink(inode, 2);
215 inode_init_owner(inode, dir, mode); 216 inode_init_owner(inode, dir, mode);
216 dquot_initialize(inode); 217 status = dquot_initialize(inode);
218 if (status)
219 return ERR_PTR(status);
220
217 return inode; 221 return inode;
218} 222}
219 223
@@ -264,7 +268,11 @@ static int ocfs2_mknod(struct inode *dir,
264 (unsigned long long)OCFS2_I(dir)->ip_blkno, 268 (unsigned long long)OCFS2_I(dir)->ip_blkno,
265 (unsigned long)dev, mode); 269 (unsigned long)dev, mode);
266 270
267 dquot_initialize(dir); 271 status = dquot_initialize(dir);
272 if (status) {
273 mlog_errno(status);
274 return status;
275 }
268 276
269 /* get our super block */ 277 /* get our super block */
270 osb = OCFS2_SB(dir->i_sb); 278 osb = OCFS2_SB(dir->i_sb);
@@ -311,8 +319,9 @@ static int ocfs2_mknod(struct inode *dir,
311 } 319 }
312 320
313 inode = ocfs2_get_init_inode(dir, mode); 321 inode = ocfs2_get_init_inode(dir, mode);
314 if (!inode) { 322 if (IS_ERR(inode)) {
315 status = -ENOMEM; 323 status = PTR_ERR(inode);
324 inode = NULL;
316 mlog_errno(status); 325 mlog_errno(status);
317 goto leave; 326 goto leave;
318 } 327 }
@@ -708,7 +717,11 @@ static int ocfs2_link(struct dentry *old_dentry,
708 if (S_ISDIR(inode->i_mode)) 717 if (S_ISDIR(inode->i_mode))
709 return -EPERM; 718 return -EPERM;
710 719
711 dquot_initialize(dir); 720 err = dquot_initialize(dir);
721 if (err) {
722 mlog_errno(err);
723 return err;
724 }
712 725
713 err = ocfs2_double_lock(osb, &old_dir_bh, old_dir, 726 err = ocfs2_double_lock(osb, &old_dir_bh, old_dir,
714 &parent_fe_bh, dir, 0); 727 &parent_fe_bh, dir, 0);
@@ -896,7 +909,11 @@ static int ocfs2_unlink(struct inode *dir,
896 (unsigned long long)OCFS2_I(dir)->ip_blkno, 909 (unsigned long long)OCFS2_I(dir)->ip_blkno,
897 (unsigned long long)OCFS2_I(inode)->ip_blkno); 910 (unsigned long long)OCFS2_I(inode)->ip_blkno);
898 911
899 dquot_initialize(dir); 912 status = dquot_initialize(dir);
913 if (status) {
914 mlog_errno(status);
915 return status;
916 }
900 917
901 BUG_ON(d_inode(dentry->d_parent) != dir); 918 BUG_ON(d_inode(dentry->d_parent) != dir);
902 919
@@ -1230,8 +1247,16 @@ static int ocfs2_rename(struct inode *old_dir,
1230 old_dentry->d_name.len, old_dentry->d_name.name, 1247 old_dentry->d_name.len, old_dentry->d_name.name,
1231 new_dentry->d_name.len, new_dentry->d_name.name); 1248 new_dentry->d_name.len, new_dentry->d_name.name);
1232 1249
1233 dquot_initialize(old_dir); 1250 status = dquot_initialize(old_dir);
1234 dquot_initialize(new_dir); 1251 if (status) {
1252 mlog_errno(status);
1253 goto bail;
1254 }
1255 status = dquot_initialize(new_dir);
1256 if (status) {
1257 mlog_errno(status);
1258 goto bail;
1259 }
1235 1260
1236 osb = OCFS2_SB(old_dir->i_sb); 1261 osb = OCFS2_SB(old_dir->i_sb);
1237 1262
@@ -1786,7 +1811,11 @@ static int ocfs2_symlink(struct inode *dir,
1786 trace_ocfs2_symlink_begin(dir, dentry, symname, 1811 trace_ocfs2_symlink_begin(dir, dentry, symname,
1787 dentry->d_name.len, dentry->d_name.name); 1812 dentry->d_name.len, dentry->d_name.name);
1788 1813
1789 dquot_initialize(dir); 1814 status = dquot_initialize(dir);
1815 if (status) {
1816 mlog_errno(status);
1817 goto bail;
1818 }
1790 1819
1791 sb = dir->i_sb; 1820 sb = dir->i_sb;
1792 osb = OCFS2_SB(sb); 1821 osb = OCFS2_SB(sb);
@@ -1831,8 +1860,9 @@ static int ocfs2_symlink(struct inode *dir,
1831 } 1860 }
1832 1861
1833 inode = ocfs2_get_init_inode(dir, S_IFLNK | S_IRWXUGO); 1862 inode = ocfs2_get_init_inode(dir, S_IFLNK | S_IRWXUGO);
1834 if (!inode) { 1863 if (IS_ERR(inode)) {
1835 status = -ENOMEM; 1864 status = PTR_ERR(inode);
1865 inode = NULL;
1836 mlog_errno(status); 1866 mlog_errno(status);
1837 goto bail; 1867 goto bail;
1838 } 1868 }
@@ -2485,8 +2515,9 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
2485 } 2515 }
2486 2516
2487 inode = ocfs2_get_init_inode(dir, mode); 2517 inode = ocfs2_get_init_inode(dir, mode);
2488 if (!inode) { 2518 if (IS_ERR(inode)) {
2489 status = -ENOMEM; 2519 status = PTR_ERR(inode);
2520 inode = NULL;
2490 mlog_errno(status); 2521 mlog_errno(status);
2491 goto leave; 2522 goto leave;
2492 } 2523 }
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 3d0b63d34225..bb07004df72a 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -499,8 +499,8 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
499 dquot = dqget(sb, 499 dquot = dqget(sb,
500 make_kqid(&init_user_ns, type, 500 make_kqid(&init_user_ns, type,
501 le64_to_cpu(dqblk->dqb_id))); 501 le64_to_cpu(dqblk->dqb_id)));
502 if (!dquot) { 502 if (IS_ERR(dquot)) {
503 status = -EIO; 503 status = PTR_ERR(dquot);
504 mlog(ML_ERROR, "Failed to get quota structure " 504 mlog(ML_ERROR, "Failed to get quota structure "
505 "for id %u, type %d. Cannot finish quota " 505 "for id %u, type %d. Cannot finish quota "
506 "file recovery.\n", 506 "file recovery.\n",
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index b69dd14c0b9b..7dc818b87cd8 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4419,8 +4419,9 @@ static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir,
4419 } 4419 }
4420 4420
4421 mutex_lock(&inode->i_mutex); 4421 mutex_lock(&inode->i_mutex);
4422 dquot_initialize(dir); 4422 error = dquot_initialize(dir);
4423 error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve); 4423 if (!error)
4424 error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve);
4424 mutex_unlock(&inode->i_mutex); 4425 mutex_unlock(&inode->i_mutex);
4425 if (!error) 4426 if (!error)
4426 fsnotify_create(dir, new_dentry); 4427 fsnotify_create(dir, new_dentry);
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 20d1f74561cf..fed66e2c9fe8 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -247,7 +247,7 @@ struct dqstats dqstats;
247EXPORT_SYMBOL(dqstats); 247EXPORT_SYMBOL(dqstats);
248 248
249static qsize_t inode_get_rsv_space(struct inode *inode); 249static qsize_t inode_get_rsv_space(struct inode *inode);
250static void __dquot_initialize(struct inode *inode, int type); 250static int __dquot_initialize(struct inode *inode, int type);
251 251
252static inline unsigned int 252static inline unsigned int
253hashfn(const struct super_block *sb, struct kqid qid) 253hashfn(const struct super_block *sb, struct kqid qid)
@@ -832,16 +832,17 @@ static struct dquot *get_empty_dquot(struct super_block *sb, int type)
832struct dquot *dqget(struct super_block *sb, struct kqid qid) 832struct dquot *dqget(struct super_block *sb, struct kqid qid)
833{ 833{
834 unsigned int hashent = hashfn(sb, qid); 834 unsigned int hashent = hashfn(sb, qid);
835 struct dquot *dquot = NULL, *empty = NULL; 835 struct dquot *dquot, *empty = NULL;
836 836
837 if (!sb_has_quota_active(sb, qid.type)) 837 if (!sb_has_quota_active(sb, qid.type))
838 return NULL; 838 return ERR_PTR(-ESRCH);
839we_slept: 839we_slept:
840 spin_lock(&dq_list_lock); 840 spin_lock(&dq_list_lock);
841 spin_lock(&dq_state_lock); 841 spin_lock(&dq_state_lock);
842 if (!sb_has_quota_active(sb, qid.type)) { 842 if (!sb_has_quota_active(sb, qid.type)) {
843 spin_unlock(&dq_state_lock); 843 spin_unlock(&dq_state_lock);
844 spin_unlock(&dq_list_lock); 844 spin_unlock(&dq_list_lock);
845 dquot = ERR_PTR(-ESRCH);
845 goto out; 846 goto out;
846 } 847 }
847 spin_unlock(&dq_state_lock); 848 spin_unlock(&dq_state_lock);
@@ -876,11 +877,15 @@ we_slept:
876 * already finished or it will be canceled due to dq_count > 1 test */ 877 * already finished or it will be canceled due to dq_count > 1 test */
877 wait_on_dquot(dquot); 878 wait_on_dquot(dquot);
878 /* Read the dquot / allocate space in quota file */ 879 /* Read the dquot / allocate space in quota file */
879 if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags) && 880 if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) {
880 sb->dq_op->acquire_dquot(dquot) < 0) { 881 int err;
881 dqput(dquot); 882
882 dquot = NULL; 883 err = sb->dq_op->acquire_dquot(dquot);
883 goto out; 884 if (err < 0) {
885 dqput(dquot);
886 dquot = ERR_PTR(err);
887 goto out;
888 }
884 } 889 }
885#ifdef CONFIG_QUOTA_DEBUG 890#ifdef CONFIG_QUOTA_DEBUG
886 BUG_ON(!dquot->dq_sb); /* Has somebody invalidated entry under us? */ 891 BUG_ON(!dquot->dq_sb); /* Has somebody invalidated entry under us? */
@@ -1390,15 +1395,16 @@ static int dquot_active(const struct inode *inode)
1390 * It is better to call this function outside of any transaction as it 1395 * It is better to call this function outside of any transaction as it
1391 * might need a lot of space in journal for dquot structure allocation. 1396 * might need a lot of space in journal for dquot structure allocation.
1392 */ 1397 */
1393static void __dquot_initialize(struct inode *inode, int type) 1398static int __dquot_initialize(struct inode *inode, int type)
1394{ 1399{
1395 int cnt, init_needed = 0; 1400 int cnt, init_needed = 0;
1396 struct dquot **dquots, *got[MAXQUOTAS]; 1401 struct dquot **dquots, *got[MAXQUOTAS];
1397 struct super_block *sb = inode->i_sb; 1402 struct super_block *sb = inode->i_sb;
1398 qsize_t rsv; 1403 qsize_t rsv;
1404 int ret = 0;
1399 1405
1400 if (!dquot_active(inode)) 1406 if (!dquot_active(inode))
1401 return; 1407 return 0;
1402 1408
1403 dquots = i_dquot(inode); 1409 dquots = i_dquot(inode);
1404 1410
@@ -1407,6 +1413,7 @@ static void __dquot_initialize(struct inode *inode, int type)
1407 struct kqid qid; 1413 struct kqid qid;
1408 kprojid_t projid; 1414 kprojid_t projid;
1409 int rc; 1415 int rc;
1416 struct dquot *dquot;
1410 1417
1411 got[cnt] = NULL; 1418 got[cnt] = NULL;
1412 if (type != -1 && cnt != type) 1419 if (type != -1 && cnt != type)
@@ -1438,16 +1445,25 @@ static void __dquot_initialize(struct inode *inode, int type)
1438 qid = make_kqid_projid(projid); 1445 qid = make_kqid_projid(projid);
1439 break; 1446 break;
1440 } 1447 }
1441 got[cnt] = dqget(sb, qid); 1448 dquot = dqget(sb, qid);
1449 if (IS_ERR(dquot)) {
1450 /* We raced with somebody turning quotas off... */
1451 if (PTR_ERR(dquot) != -ESRCH) {
1452 ret = PTR_ERR(dquot);
1453 goto out_put;
1454 }
1455 dquot = NULL;
1456 }
1457 got[cnt] = dquot;
1442 } 1458 }
1443 1459
1444 /* All required i_dquot has been initialized */ 1460 /* All required i_dquot has been initialized */
1445 if (!init_needed) 1461 if (!init_needed)
1446 return; 1462 return 0;
1447 1463
1448 spin_lock(&dq_data_lock); 1464 spin_lock(&dq_data_lock);
1449 if (IS_NOQUOTA(inode)) 1465 if (IS_NOQUOTA(inode))
1450 goto out_err; 1466 goto out_lock;
1451 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1467 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1452 if (type != -1 && cnt != type) 1468 if (type != -1 && cnt != type)
1453 continue; 1469 continue;
@@ -1469,15 +1485,18 @@ static void __dquot_initialize(struct inode *inode, int type)
1469 dquot_resv_space(dquots[cnt], rsv); 1485 dquot_resv_space(dquots[cnt], rsv);
1470 } 1486 }
1471 } 1487 }
1472out_err: 1488out_lock:
1473 spin_unlock(&dq_data_lock); 1489 spin_unlock(&dq_data_lock);
1490out_put:
1474 /* Drop unused references */ 1491 /* Drop unused references */
1475 dqput_all(got); 1492 dqput_all(got);
1493
1494 return ret;
1476} 1495}
1477 1496
1478void dquot_initialize(struct inode *inode) 1497int dquot_initialize(struct inode *inode)
1479{ 1498{
1480 __dquot_initialize(inode, -1); 1499 return __dquot_initialize(inode, -1);
1481} 1500}
1482EXPORT_SYMBOL(dquot_initialize); 1501EXPORT_SYMBOL(dquot_initialize);
1483 1502
@@ -1961,18 +1980,37 @@ EXPORT_SYMBOL(__dquot_transfer);
1961int dquot_transfer(struct inode *inode, struct iattr *iattr) 1980int dquot_transfer(struct inode *inode, struct iattr *iattr)
1962{ 1981{
1963 struct dquot *transfer_to[MAXQUOTAS] = {}; 1982 struct dquot *transfer_to[MAXQUOTAS] = {};
1983 struct dquot *dquot;
1964 struct super_block *sb = inode->i_sb; 1984 struct super_block *sb = inode->i_sb;
1965 int ret; 1985 int ret;
1966 1986
1967 if (!dquot_active(inode)) 1987 if (!dquot_active(inode))
1968 return 0; 1988 return 0;
1969 1989
1970 if (iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) 1990 if (iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)){
1971 transfer_to[USRQUOTA] = dqget(sb, make_kqid_uid(iattr->ia_uid)); 1991 dquot = dqget(sb, make_kqid_uid(iattr->ia_uid));
1972 if (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid)) 1992 if (IS_ERR(dquot)) {
1973 transfer_to[GRPQUOTA] = dqget(sb, make_kqid_gid(iattr->ia_gid)); 1993 if (PTR_ERR(dquot) != -ESRCH) {
1974 1994 ret = PTR_ERR(dquot);
1995 goto out_put;
1996 }
1997 dquot = NULL;
1998 }
1999 transfer_to[USRQUOTA] = dquot;
2000 }
2001 if (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid)){
2002 dquot = dqget(sb, make_kqid_gid(iattr->ia_gid));
2003 if (IS_ERR(dquot)) {
2004 if (PTR_ERR(dquot) != -ESRCH) {
2005 ret = PTR_ERR(dquot);
2006 goto out_put;
2007 }
2008 dquot = NULL;
2009 }
2010 transfer_to[GRPQUOTA] = dquot;
2011 }
1975 ret = __dquot_transfer(inode, transfer_to); 2012 ret = __dquot_transfer(inode, transfer_to);
2013out_put:
1976 dqput_all(transfer_to); 2014 dqput_all(transfer_to);
1977 return ret; 2015 return ret;
1978} 2016}
@@ -2518,8 +2556,8 @@ int dquot_get_dqblk(struct super_block *sb, struct kqid qid,
2518 struct dquot *dquot; 2556 struct dquot *dquot;
2519 2557
2520 dquot = dqget(sb, qid); 2558 dquot = dqget(sb, qid);
2521 if (!dquot) 2559 if (IS_ERR(dquot))
2522 return -ESRCH; 2560 return PTR_ERR(dquot);
2523 do_get_dqblk(dquot, di); 2561 do_get_dqblk(dquot, di);
2524 dqput(dquot); 2562 dqput(dquot);
2525 2563
@@ -2631,8 +2669,8 @@ int dquot_set_dqblk(struct super_block *sb, struct kqid qid,
2631 int rc; 2669 int rc;
2632 2670
2633 dquot = dqget(sb, qid); 2671 dquot = dqget(sb, qid);
2634 if (!dquot) { 2672 if (IS_ERR(dquot)) {
2635 rc = -ESRCH; 2673 rc = PTR_ERR(dquot);
2636 goto out; 2674 goto out;
2637 } 2675 }
2638 rc = do_set_dqblk(dquot, di); 2676 rc = do_set_dqblk(dquot, di);
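
[Editor's note: the dqget() conversion above changes its error contract: instead of NULL on any failure, it now returns ERR_PTR(), with -ESRCH reserved for "quotas are not active for this id/type". Callers that used to treat NULL as a soft failure need the pattern below; the helper name example_dqget() is illustrative only.]

static struct dquot *example_dqget(struct super_block *sb, struct kqid qid)
{
	struct dquot *dquot = dqget(sb, qid);

	if (IS_ERR(dquot)) {
		/* Quotas turned off underneath us: not an error for most callers. */
		if (PTR_ERR(dquot) == -ESRCH)
			return NULL;
		return dquot;	/* hard error: propagate the ERR_PTR */
	}
	return dquot;
}
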
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 86ded7375c21..3746367098fd 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -141,9 +141,9 @@ static int quota_getinfo(struct super_block *sb, int type, void __user *addr)
141 if (tstate->flags & QCI_ROOT_SQUASH) 141 if (tstate->flags & QCI_ROOT_SQUASH)
142 uinfo.dqi_flags |= DQF_ROOT_SQUASH; 142 uinfo.dqi_flags |= DQF_ROOT_SQUASH;
143 uinfo.dqi_valid = IIF_ALL; 143 uinfo.dqi_valid = IIF_ALL;
144 if (!ret && copy_to_user(addr, &uinfo, sizeof(uinfo))) 144 if (copy_to_user(addr, &uinfo, sizeof(uinfo)))
145 return -EFAULT; 145 return -EFAULT;
146 return ret; 146 return 0;
147} 147}
148 148
149static int quota_setinfo(struct super_block *sb, int type, void __user *addr) 149static int quota_setinfo(struct super_block *sb, int type, void __user *addr)
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index f6f2fbad9777..3d8e7e671d5b 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -3319,8 +3319,11 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
3319 /* must be turned off for recursive notify_change calls */ 3319 /* must be turned off for recursive notify_change calls */
3320 ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID); 3320 ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID);
3321 3321
3322 if (is_quota_modification(inode, attr)) 3322 if (is_quota_modification(inode, attr)) {
3323 dquot_initialize(inode); 3323 error = dquot_initialize(inode);
3324 if (error)
3325 return error;
3326 }
3324 reiserfs_write_lock(inode->i_sb); 3327 reiserfs_write_lock(inode->i_sb);
3325 if (attr->ia_valid & ATTR_SIZE) { 3328 if (attr->ia_valid & ATTR_SIZE) {
3326 /* 3329 /*
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index b55a074653d7..5f1c9c29eb8c 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -613,8 +613,7 @@ static int new_inode_init(struct inode *inode, struct inode *dir, umode_t mode)
613 * we have to set uid and gid here 613 * we have to set uid and gid here
614 */ 614 */
615 inode_init_owner(inode, dir, mode); 615 inode_init_owner(inode, dir, mode);
616 dquot_initialize(inode); 616 return dquot_initialize(inode);
617 return 0;
618} 617}
619 618
620static int reiserfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, 619static int reiserfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
@@ -633,12 +632,18 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, umode_t mod
633 struct reiserfs_transaction_handle th; 632 struct reiserfs_transaction_handle th;
634 struct reiserfs_security_handle security; 633 struct reiserfs_security_handle security;
635 634
636 dquot_initialize(dir); 635 retval = dquot_initialize(dir);
636 if (retval)
637 return retval;
637 638
638 if (!(inode = new_inode(dir->i_sb))) { 639 if (!(inode = new_inode(dir->i_sb))) {
639 return -ENOMEM; 640 return -ENOMEM;
640 } 641 }
641 new_inode_init(inode, dir, mode); 642 retval = new_inode_init(inode, dir, mode);
643 if (retval) {
644 drop_new_inode(inode);
645 return retval;
646 }
642 647
643 jbegin_count += reiserfs_cache_default_acl(dir); 648 jbegin_count += reiserfs_cache_default_acl(dir);
644 retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security); 649 retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security);
@@ -710,12 +715,18 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode
710 if (!new_valid_dev(rdev)) 715 if (!new_valid_dev(rdev))
711 return -EINVAL; 716 return -EINVAL;
712 717
713 dquot_initialize(dir); 718 retval = dquot_initialize(dir);
719 if (retval)
720 return retval;
714 721
715 if (!(inode = new_inode(dir->i_sb))) { 722 if (!(inode = new_inode(dir->i_sb))) {
716 return -ENOMEM; 723 return -ENOMEM;
717 } 724 }
718 new_inode_init(inode, dir, mode); 725 retval = new_inode_init(inode, dir, mode);
726 if (retval) {
727 drop_new_inode(inode);
728 return retval;
729 }
719 730
720 jbegin_count += reiserfs_cache_default_acl(dir); 731 jbegin_count += reiserfs_cache_default_acl(dir);
721 retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security); 732 retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security);
@@ -787,7 +798,9 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
787 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) + 798 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) +
788 REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb)); 799 REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb));
789 800
790 dquot_initialize(dir); 801 retval = dquot_initialize(dir);
802 if (retval)
803 return retval;
791 804
792#ifdef DISPLACE_NEW_PACKING_LOCALITIES 805#ifdef DISPLACE_NEW_PACKING_LOCALITIES
793 /* 806 /*
@@ -800,7 +813,11 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
800 if (!(inode = new_inode(dir->i_sb))) { 813 if (!(inode = new_inode(dir->i_sb))) {
801 return -ENOMEM; 814 return -ENOMEM;
802 } 815 }
803 new_inode_init(inode, dir, mode); 816 retval = new_inode_init(inode, dir, mode);
817 if (retval) {
818 drop_new_inode(inode);
819 return retval;
820 }
804 821
805 jbegin_count += reiserfs_cache_default_acl(dir); 822 jbegin_count += reiserfs_cache_default_acl(dir);
806 retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security); 823 retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security);
@@ -899,7 +916,9 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry)
899 JOURNAL_PER_BALANCE_CNT * 2 + 2 + 916 JOURNAL_PER_BALANCE_CNT * 2 + 2 +
900 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); 917 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
901 918
902 dquot_initialize(dir); 919 retval = dquot_initialize(dir);
920 if (retval)
921 return retval;
903 922
904 reiserfs_write_lock(dir->i_sb); 923 reiserfs_write_lock(dir->i_sb);
905 retval = journal_begin(&th, dir->i_sb, jbegin_count); 924 retval = journal_begin(&th, dir->i_sb, jbegin_count);
@@ -985,7 +1004,9 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
985 int jbegin_count; 1004 int jbegin_count;
986 unsigned long savelink; 1005 unsigned long savelink;
987 1006
988 dquot_initialize(dir); 1007 retval = dquot_initialize(dir);
1008 if (retval)
1009 return retval;
989 1010
990 inode = d_inode(dentry); 1011 inode = d_inode(dentry);
991 1012
@@ -1095,12 +1116,18 @@ static int reiserfs_symlink(struct inode *parent_dir,
1095 2 * (REISERFS_QUOTA_INIT_BLOCKS(parent_dir->i_sb) + 1116 2 * (REISERFS_QUOTA_INIT_BLOCKS(parent_dir->i_sb) +
1096 REISERFS_QUOTA_TRANS_BLOCKS(parent_dir->i_sb)); 1117 REISERFS_QUOTA_TRANS_BLOCKS(parent_dir->i_sb));
1097 1118
1098 dquot_initialize(parent_dir); 1119 retval = dquot_initialize(parent_dir);
1120 if (retval)
1121 return retval;
1099 1122
1100 if (!(inode = new_inode(parent_dir->i_sb))) { 1123 if (!(inode = new_inode(parent_dir->i_sb))) {
1101 return -ENOMEM; 1124 return -ENOMEM;
1102 } 1125 }
1103 new_inode_init(inode, parent_dir, mode); 1126 retval = new_inode_init(inode, parent_dir, mode);
1127 if (retval) {
1128 drop_new_inode(inode);
1129 return retval;
1130 }
1104 1131
1105 retval = reiserfs_security_init(parent_dir, inode, &dentry->d_name, 1132 retval = reiserfs_security_init(parent_dir, inode, &dentry->d_name,
1106 &security); 1133 &security);
@@ -1184,7 +1211,9 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
1184 JOURNAL_PER_BALANCE_CNT * 3 + 1211 JOURNAL_PER_BALANCE_CNT * 3 +
1185 2 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); 1212 2 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
1186 1213
1187 dquot_initialize(dir); 1214 retval = dquot_initialize(dir);
1215 if (retval)
1216 return retval;
1188 1217
1189 reiserfs_write_lock(dir->i_sb); 1218 reiserfs_write_lock(dir->i_sb);
1190 if (inode->i_nlink >= REISERFS_LINK_MAX) { 1219 if (inode->i_nlink >= REISERFS_LINK_MAX) {
@@ -1308,8 +1337,12 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1308 JOURNAL_PER_BALANCE_CNT * 3 + 5 + 1337 JOURNAL_PER_BALANCE_CNT * 3 + 5 +
1309 4 * REISERFS_QUOTA_TRANS_BLOCKS(old_dir->i_sb); 1338 4 * REISERFS_QUOTA_TRANS_BLOCKS(old_dir->i_sb);
1310 1339
1311 dquot_initialize(old_dir); 1340 retval = dquot_initialize(old_dir);
1312 dquot_initialize(new_dir); 1341 if (retval)
1342 return retval;
1343 retval = dquot_initialize(new_dir);
1344 if (retval)
1345 return retval;
1313 1346
1314 old_inode = d_inode(old_dentry); 1347 old_inode = d_inode(old_dentry);
1315 new_dentry_inode = d_inode(new_dentry); 1348 new_dentry_inode = d_inode(new_dentry);
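
[Editor's note: the reiserfs hunks follow the same theme with one wrinkle: new_inode_init() now returns dquot_initialize()'s result, and since the inode has already been allocated at that point, every caller must release it with drop_new_inode() on failure. In outline, with calls taken from the diff:]

	inode = new_inode(dir->i_sb);
	if (!inode)
		return -ENOMEM;

	retval = new_inode_init(inode, dir, mode);
	if (retval) {
		drop_new_inode(inode);	/* undo the allocation already done */
		return retval;
	}
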
diff --git a/fs/udf/super.c b/fs/udf/super.c
index b96f190bc567..81155b9b445b 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -2070,6 +2070,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
2070 struct udf_options uopt; 2070 struct udf_options uopt;
2071 struct kernel_lb_addr rootdir, fileset; 2071 struct kernel_lb_addr rootdir, fileset;
2072 struct udf_sb_info *sbi; 2072 struct udf_sb_info *sbi;
2073 bool lvid_open = false;
2073 2074
2074 uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT); 2075 uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT);
2075 uopt.uid = INVALID_UID; 2076 uopt.uid = INVALID_UID;
@@ -2216,8 +2217,10 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
2216 le16_to_cpu(ts.year), ts.month, ts.day, 2217 le16_to_cpu(ts.year), ts.month, ts.day,
2217 ts.hour, ts.minute, le16_to_cpu(ts.typeAndTimezone)); 2218 ts.hour, ts.minute, le16_to_cpu(ts.typeAndTimezone));
2218 } 2219 }
2219 if (!(sb->s_flags & MS_RDONLY)) 2220 if (!(sb->s_flags & MS_RDONLY)) {
2220 udf_open_lvid(sb); 2221 udf_open_lvid(sb);
2222 lvid_open = true;
2223 }
2221 2224
2222 /* Assign the root inode */ 2225 /* Assign the root inode */
2223 /* assign inodes by physical block number */ 2226 /* assign inodes by physical block number */
@@ -2248,7 +2251,7 @@ parse_options_failure:
2248 if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) 2251 if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP))
2249 unload_nls(sbi->s_nls_map); 2252 unload_nls(sbi->s_nls_map);
2250#endif 2253#endif
2251 if (!(sb->s_flags & MS_RDONLY)) 2254 if (lvid_open)
2252 udf_close_lvid(sb); 2255 udf_close_lvid(sb);
2253 brelse(sbi->s_lvid_bh); 2256 brelse(sbi->s_lvid_bh);
2254 udf_sb_free_partitions(sb); 2257 udf_sb_free_partitions(sb);
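
[Editor's note: the udf change is the read-only fix from the pull summary: udf_close_lvid() writes to the device, so the error path now keys off whether udf_open_lvid() actually ran (recorded in the new lvid_open flag) instead of re-checking the read-only flag. The same pattern in generic form; open_thing()/close_thing() are placeholders, not real helpers:]

	bool opened = false;

	if (!(sb->s_flags & MS_RDONLY)) {
		open_thing(sb);		/* has on-disk side effects */
		opened = true;
	}
	/* ... on the error path ... */
	if (opened)			/* never "close" what was never opened */
		close_thing(sb);
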
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 4b7b4ebaa633..e8130138f29d 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -118,9 +118,8 @@ struct bio {
118#define BIO_USER_MAPPED 4 /* contains user pages */ 118#define BIO_USER_MAPPED 4 /* contains user pages */
119#define BIO_NULL_MAPPED 5 /* contains invalid user pages */ 119#define BIO_NULL_MAPPED 5 /* contains invalid user pages */
120#define BIO_QUIET 6 /* Make BIO Quiet */ 120#define BIO_QUIET 6 /* Make BIO Quiet */
121#define BIO_SNAP_STABLE 7 /* bio data must be snapshotted during write */ 121#define BIO_CHAIN 7 /* chained bio, ->bi_remaining in effect */
122#define BIO_CHAIN 8 /* chained bio, ->bi_remaining in effect */ 122#define BIO_REFFED 8 /* bio has elevated ->bi_cnt */
123#define BIO_REFFED 9 /* bio has elevated ->bi_cnt */
124 123
125/* 124/*
126 * Flags starting here get preserved by bio_reset() - this includes 125 * Flags starting here get preserved by bio_reset() - this includes
diff --git a/include/linux/jbd.h b/include/linux/jbd.h
deleted file mode 100644
index d32615280be9..000000000000
--- a/include/linux/jbd.h
+++ /dev/null
@@ -1,1047 +0,0 @@
1/*
2 * linux/include/linux/jbd.h
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>
5 *
6 * Copyright 1998-2000 Red Hat, Inc --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Definitions for transaction data structures for the buffer cache
13 * filesystem journaling support.
14 */
15
16#ifndef _LINUX_JBD_H
17#define _LINUX_JBD_H
18
19/* Allow this file to be included directly into e2fsprogs */
20#ifndef __KERNEL__
21#include "jfs_compat.h"
22#define JFS_DEBUG
23#define jfs_debug jbd_debug
24#else
25
26#include <linux/types.h>
27#include <linux/buffer_head.h>
28#include <linux/journal-head.h>
29#include <linux/stddef.h>
30#include <linux/mutex.h>
31#include <linux/timer.h>
32#include <linux/lockdep.h>
33#include <linux/slab.h>
34
35#define journal_oom_retry 1
36
37/*
38 * Define JBD_PARANOID_IOFAIL to cause a kernel BUG() if ext3 finds
39 * certain classes of error which can occur due to failed IOs. Under
40 * normal use we want ext3 to continue after such errors, because
41 * hardware _can_ fail, but for debugging purposes when running tests on
42 * known-good hardware we may want to trap these errors.
43 */
44#undef JBD_PARANOID_IOFAIL
45
46/*
47 * The default maximum commit age, in seconds.
48 */
49#define JBD_DEFAULT_MAX_COMMIT_AGE 5
50
51#ifdef CONFIG_JBD_DEBUG
52/*
53 * Define JBD_EXPENSIVE_CHECKING to enable more expensive internal
54 * consistency checks. By default we don't do this unless
55 * CONFIG_JBD_DEBUG is on.
56 */
57#define JBD_EXPENSIVE_CHECKING
58extern u8 journal_enable_debug;
59
60void __jbd_debug(int level, const char *file, const char *func,
61 unsigned int line, const char *fmt, ...);
62
63#define jbd_debug(n, fmt, a...) \
64 __jbd_debug((n), __FILE__, __func__, __LINE__, (fmt), ##a)
65#else
66#define jbd_debug(n, fmt, a...) /**/
67#endif
68
69static inline void *jbd_alloc(size_t size, gfp_t flags)
70{
71 return (void *)__get_free_pages(flags, get_order(size));
72}
73
74static inline void jbd_free(void *ptr, size_t size)
75{
76 free_pages((unsigned long)ptr, get_order(size));
77}
78
79#define JFS_MIN_JOURNAL_BLOCKS 1024
80
81
82/**
83 * typedef handle_t - The handle_t type represents a single atomic update being performed by some process.
84 *
85 * All filesystem modifications made by the process go
86 * through this handle. Recursive operations (such as quota operations)
87 * are gathered into a single update.
88 *
89 * The buffer credits field is used to account for journaled buffers
90 * being modified by the running process. To ensure that there is
91 * enough log space for all outstanding operations, we need to limit the
92 * number of outstanding buffers possible at any time. When the
93 * operation completes, any buffer credits not used are credited back to
94 * the transaction, so that at all times we know how many buffers the
95 * outstanding updates on a transaction might possibly touch.
96 *
97 * This is an opaque datatype.
98 **/
99typedef struct handle_s handle_t; /* Atomic operation type */
100
101
102/**
103 * typedef journal_t - The journal_t maintains all of the journaling state information for a single filesystem.
104 *
105 * journal_t is linked to from the fs superblock structure.
106 *
107 * We use the journal_t to keep track of all outstanding transaction
108 * activity on the filesystem, and to manage the state of the log
109 * writing process.
110 *
111 * This is an opaque datatype.
112 **/
113typedef struct journal_s journal_t; /* Journal control structure */
114#endif
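
[Editor's note: to make the two opaque types above concrete, here is a hedged sketch of the classic jbd handle lifecycle, assuming the standard entry points journal_start(), journal_get_write_access(), journal_dirty_metadata() and journal_stop(), and assuming a journal_t *journal and struct buffer_head *bh are in scope. Error handling is abbreviated.]

	handle_t *handle;
	int err;

	/* Reserve credits for the buffers this update may dirty. */
	handle = journal_start(journal, 1);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	err = journal_get_write_access(handle, bh);	/* declare intent to modify bh */
	if (!err) {
		/* ... modify bh->b_data ... */
		err = journal_dirty_metadata(handle, bh);
	}
	journal_stop(handle);		/* unused credits go back to the transaction */
	return err;
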
115
116/*
117 * Internal structures used by the logging mechanism:
118 */
119
120#define JFS_MAGIC_NUMBER 0xc03b3998U /* The first 4 bytes of /dev/random! */
121
122/*
123 * On-disk structures
124 */
125
126/*
127 * Descriptor block types:
128 */
129
130#define JFS_DESCRIPTOR_BLOCK 1
131#define JFS_COMMIT_BLOCK 2
132#define JFS_SUPERBLOCK_V1 3
133#define JFS_SUPERBLOCK_V2 4
134#define JFS_REVOKE_BLOCK 5
135
136/*
137 * Standard header for all descriptor blocks:
138 */
139typedef struct journal_header_s
140{
141 __be32 h_magic;
142 __be32 h_blocktype;
143 __be32 h_sequence;
144} journal_header_t;
145
146
147/*
148 * The block tag: used to describe a single buffer in the journal
149 */
150typedef struct journal_block_tag_s
151{
152 __be32 t_blocknr; /* The on-disk block number */
153 __be32 t_flags; /* See below */
154} journal_block_tag_t;
155
156/*
157 * The revoke descriptor: used on disk to describe a series of blocks to
158 * be revoked from the log
159 */
160typedef struct journal_revoke_header_s
161{
162 journal_header_t r_header;
163 __be32 r_count; /* Count of bytes used in the block */
164} journal_revoke_header_t;
165
166
167/* Definitions for the journal tag flags word: */
168#define JFS_FLAG_ESCAPE 1 /* on-disk block is escaped */
169#define JFS_FLAG_SAME_UUID 2 /* block has same uuid as previous */
170#define JFS_FLAG_DELETED 4 /* block deleted by this transaction */
171#define JFS_FLAG_LAST_TAG 8 /* last tag in this descriptor block */
172
173
174/*
175 * The journal superblock. All fields are in big-endian byte order.
176 */
177typedef struct journal_superblock_s
178{
179/* 0x0000 */
180 journal_header_t s_header;
181
182/* 0x000C */
183 /* Static information describing the journal */
184 __be32 s_blocksize; /* journal device blocksize */
185 __be32 s_maxlen; /* total blocks in journal file */
186 __be32 s_first; /* first block of log information */
187
188/* 0x0018 */
189 /* Dynamic information describing the current state of the log */
190 __be32 s_sequence; /* first commit ID expected in log */
191 __be32 s_start; /* blocknr of start of log */
192
193/* 0x0020 */
194 /* Error value, as set by journal_abort(). */
195 __be32 s_errno;
196
197/* 0x0024 */
198 /* Remaining fields are only valid in a version-2 superblock */
199 __be32 s_feature_compat; /* compatible feature set */
200 __be32 s_feature_incompat; /* incompatible feature set */
201 __be32 s_feature_ro_compat; /* readonly-compatible feature set */
202/* 0x0030 */
203 __u8 s_uuid[16]; /* 128-bit uuid for journal */
204
205/* 0x0040 */
206 __be32 s_nr_users; /* Nr of filesystems sharing log */
207
208 __be32 s_dynsuper; /* Blocknr of dynamic superblock copy*/
209
210/* 0x0048 */
211 __be32 s_max_transaction; /* Limit of journal blocks per trans.*/
212 __be32 s_max_trans_data; /* Limit of data blocks per trans. */
213
214/* 0x0050 */
215 __u32 s_padding[44];
216
217/* 0x0100 */
218 __u8 s_users[16*48]; /* ids of all fs'es sharing the log */
219/* 0x0400 */
220} journal_superblock_t;
221
222#define JFS_HAS_COMPAT_FEATURE(j,mask) \
223 ((j)->j_format_version >= 2 && \
224 ((j)->j_superblock->s_feature_compat & cpu_to_be32((mask))))
225#define JFS_HAS_RO_COMPAT_FEATURE(j,mask) \
226 ((j)->j_format_version >= 2 && \
227 ((j)->j_superblock->s_feature_ro_compat & cpu_to_be32((mask))))
228#define JFS_HAS_INCOMPAT_FEATURE(j,mask) \
229 ((j)->j_format_version >= 2 && \
230 ((j)->j_superblock->s_feature_incompat & cpu_to_be32((mask))))
231
232#define JFS_FEATURE_INCOMPAT_REVOKE 0x00000001
233
234/* Features known to this kernel version: */
235#define JFS_KNOWN_COMPAT_FEATURES 0
236#define JFS_KNOWN_ROCOMPAT_FEATURES 0
237#define JFS_KNOWN_INCOMPAT_FEATURES JFS_FEATURE_INCOMPAT_REVOKE
238
239#ifdef __KERNEL__
240
241#include <linux/fs.h>
242#include <linux/sched.h>
243
244enum jbd_state_bits {
245 BH_JBD /* Has an attached ext3 journal_head */
246 = BH_PrivateStart,
247 BH_JWrite, /* Being written to log (@@@ DEBUGGING) */
248 BH_Freed, /* Has been freed (truncated) */
249 BH_Revoked, /* Has been revoked from the log */
250 BH_RevokeValid, /* Revoked flag is valid */
251 BH_JBDDirty, /* Is dirty but journaled */
252 BH_State, /* Pins most journal_head state */
253 BH_JournalHead, /* Pins bh->b_private and jh->b_bh */
254 BH_Unshadow, /* Dummy bit, for BJ_Shadow wakeup filtering */
255 BH_JBDPrivateStart, /* First bit available for private use by FS */
256};
257
258BUFFER_FNS(JBD, jbd)
259BUFFER_FNS(JWrite, jwrite)
260BUFFER_FNS(JBDDirty, jbddirty)
261TAS_BUFFER_FNS(JBDDirty, jbddirty)
262BUFFER_FNS(Revoked, revoked)
263TAS_BUFFER_FNS(Revoked, revoked)
264BUFFER_FNS(RevokeValid, revokevalid)
265TAS_BUFFER_FNS(RevokeValid, revokevalid)
266BUFFER_FNS(Freed, freed)
267
268#include <linux/jbd_common.h>
269
270#define J_ASSERT(assert) BUG_ON(!(assert))
271
272#define J_ASSERT_BH(bh, expr) J_ASSERT(expr)
273#define J_ASSERT_JH(jh, expr) J_ASSERT(expr)
274
275#if defined(JBD_PARANOID_IOFAIL)
276#define J_EXPECT(expr, why...) J_ASSERT(expr)
277#define J_EXPECT_BH(bh, expr, why...) J_ASSERT_BH(bh, expr)
278#define J_EXPECT_JH(jh, expr, why...) J_ASSERT_JH(jh, expr)
279#else
280#define __journal_expect(expr, why...) \
281 ({ \
282 int val = (expr); \
283 if (!val) { \
284 printk(KERN_ERR \
285 "EXT3-fs unexpected failure: %s;\n",# expr); \
286 printk(KERN_ERR why "\n"); \
287 } \
288 val; \
289 })
290#define J_EXPECT(expr, why...) __journal_expect(expr, ## why)
291#define J_EXPECT_BH(bh, expr, why...) __journal_expect(expr, ## why)
292#define J_EXPECT_JH(jh, expr, why...) __journal_expect(expr, ## why)
293#endif
294
295struct jbd_revoke_table_s;
296
297/**
298 * struct handle_s - this is the concrete type associated with handle_t.
299 * @h_transaction: Which compound transaction is this update a part of?
300 * @h_buffer_credits: Number of remaining buffers we are allowed to dirty.
301 * @h_ref: Reference count on this handle
302 * @h_err: Field for caller's use to track errors through large fs operations
303 * @h_sync: flag for sync-on-close
304 * @h_jdata: flag to force data journaling
305 * @h_aborted: flag indicating fatal error on handle
306 * @h_lockdep_map: lockdep info for debugging lock problems
307 */
308struct handle_s
309{
310 /* Which compound transaction is this update a part of? */
311 transaction_t *h_transaction;
312
313 /* Number of remaining buffers we are allowed to dirty: */
314 int h_buffer_credits;
315
316 /* Reference count on this handle */
317 int h_ref;
318
319 /* Field for caller's use to track errors through large fs */
320 /* operations */
321 int h_err;
322
323 /* Flags [no locking] */
324 unsigned int h_sync: 1; /* sync-on-close */
325 unsigned int h_jdata: 1; /* force data journaling */
326 unsigned int h_aborted: 1; /* fatal error on handle */
327
328#ifdef CONFIG_DEBUG_LOCK_ALLOC
329 struct lockdep_map h_lockdep_map;
330#endif
331};
332
333
334/* The transaction_t type is the guts of the journaling mechanism. It
335 * tracks a compound transaction through its various states:
336 *
337 * RUNNING: accepting new updates
338 * LOCKED: Updates still running but we don't accept new ones
339 * RUNDOWN: Updates are tidying up but have finished requesting
340 * new buffers to modify (state not used for now)
341 * FLUSH: All updates complete, but we are still writing to disk
342 * COMMIT: All data on disk, writing commit record
343 * FINISHED: We still have to keep the transaction for checkpointing.
344 *
345 * The transaction keeps track of all of the buffers modified by a
346 * running transaction, and all of the buffers committed but not yet
347 * flushed to home for finished transactions.
348 */
349
350/*
351 * Lock ranking:
352 *
353 * j_list_lock
354 * ->jbd_lock_bh_journal_head() (This is "innermost")
355 *
356 * j_state_lock
357 * ->jbd_lock_bh_state()
358 *
359 * jbd_lock_bh_state()
360 * ->j_list_lock
361 *
362 * j_state_lock
363 * ->t_handle_lock
364 *
365 * j_state_lock
366 * ->j_list_lock (journal_unmap_buffer)
367 *
368 */
369
370struct transaction_s
371{
372 /* Pointer to the journal for this transaction. [no locking] */
373 journal_t *t_journal;
374
375 /* Sequence number for this transaction [no locking] */
376 tid_t t_tid;
377
378 /*
379 * Transaction's current state
380 * [no locking - only kjournald alters this]
381 * [j_list_lock] guards transition of a transaction into T_FINISHED
382 * state and subsequent call of __journal_drop_transaction()
383 * FIXME: needs barriers
384 * KLUDGE: [use j_state_lock]
385 */
386 enum {
387 T_RUNNING,
388 T_LOCKED,
389 T_FLUSH,
390 T_COMMIT,
391 T_COMMIT_RECORD,
392 T_FINISHED
393 } t_state;
394
395 /*
396 * Where in the log does this transaction's commit start? [no locking]
397 */
398 unsigned int t_log_start;
399
400 /* Number of buffers on the t_buffers list [j_list_lock] */
401 int t_nr_buffers;
402
403 /*
404 * Doubly-linked circular list of all buffers reserved but not yet
405 * modified by this transaction [j_list_lock]
406 */
407 struct journal_head *t_reserved_list;
408
409 /*
410 * Doubly-linked circular list of all buffers under writeout during
411 * commit [j_list_lock]
412 */
413 struct journal_head *t_locked_list;
414
415 /*
416 * Doubly-linked circular list of all metadata buffers owned by this
417 * transaction [j_list_lock]
418 */
419 struct journal_head *t_buffers;
420
421 /*
422 * Doubly-linked circular list of all data buffers still to be
423 * flushed before this transaction can be committed [j_list_lock]
424 */
425 struct journal_head *t_sync_datalist;
426
427 /*
428 * Doubly-linked circular list of all forget buffers (superseded
429 * buffers which we can un-checkpoint once this transaction commits)
430 * [j_list_lock]
431 */
432 struct journal_head *t_forget;
433
434 /*
435 * Doubly-linked circular list of all buffers still to be flushed before
436 * this transaction can be checkpointed. [j_list_lock]
437 */
438 struct journal_head *t_checkpoint_list;
439
440 /*
441 * Doubly-linked circular list of all buffers submitted for IO while
442 * checkpointing. [j_list_lock]
443 */
444 struct journal_head *t_checkpoint_io_list;
445
446 /*
447 * Doubly-linked circular list of temporary buffers currently undergoing
448 * IO in the log [j_list_lock]
449 */
450 struct journal_head *t_iobuf_list;
451
452 /*
453 * Doubly-linked circular list of metadata buffers being shadowed by log
454 * IO. The IO buffers on the iobuf list and the shadow buffers on this
455 * list match each other one for one at all times. [j_list_lock]
456 */
457 struct journal_head *t_shadow_list;
458
459 /*
460 * Doubly-linked circular list of control buffers being written to the
461 * log. [j_list_lock]
462 */
463 struct journal_head *t_log_list;
464
465 /*
466 * Protects info related to handles
467 */
468 spinlock_t t_handle_lock;
469
470 /*
471 * Number of outstanding updates running on this transaction
472 * [t_handle_lock]
473 */
474 int t_updates;
475
476 /*
477 * Number of buffers reserved for use by all handles in this transaction
 478	 * but not yet modified. [t_handle_lock]
479 */
480 int t_outstanding_credits;
481
482 /*
483 * Forward and backward links for the circular list of all transactions
484 * awaiting checkpoint. [j_list_lock]
485 */
486 transaction_t *t_cpnext, *t_cpprev;
487
488 /*
489 * When will the transaction expire (become due for commit), in jiffies?
490 * [no locking]
491 */
492 unsigned long t_expires;
493
494 /*
495 * When this transaction started, in nanoseconds [no locking]
496 */
497 ktime_t t_start_time;
498
499 /*
500 * How many handles used this transaction? [t_handle_lock]
501 */
502 int t_handle_count;
503};
504
505/**
506 * struct journal_s - this is the concrete type associated with journal_t.
507 * @j_flags: General journaling state flags
508 * @j_errno: Is there an outstanding uncleared error on the journal (from a
509 * prior abort)?
510 * @j_sb_buffer: First part of superblock buffer
511 * @j_superblock: Second part of superblock buffer
512 * @j_format_version: Version of the superblock format
513 * @j_state_lock: Protect the various scalars in the journal
514 * @j_barrier_count: Number of processes waiting to create a barrier lock
515 * @j_running_transaction: The current running transaction..
516 * @j_committing_transaction: the transaction we are pushing to disk
517 * @j_checkpoint_transactions: a linked circular list of all transactions
518 * waiting for checkpointing
519 * @j_wait_transaction_locked: Wait queue for waiting for a locked transaction
520 * to start committing, or for a barrier lock to be released
521 * @j_wait_logspace: Wait queue for waiting for checkpointing to complete
522 * @j_wait_done_commit: Wait queue for waiting for commit to complete
523 * @j_wait_checkpoint: Wait queue to trigger checkpointing
524 * @j_wait_commit: Wait queue to trigger commit
525 * @j_wait_updates: Wait queue to wait for updates to complete
526 * @j_checkpoint_mutex: Mutex for locking against concurrent checkpoints
527 * @j_head: Journal head - identifies the first unused block in the journal
528 * @j_tail: Journal tail - identifies the oldest still-used block in the
529 * journal.
530 * @j_free: Journal free - how many free blocks are there in the journal?
531 * @j_first: The block number of the first usable block
532 * @j_last: The block number one beyond the last usable block
533 * @j_dev: Device where we store the journal
534 * @j_blocksize: blocksize for the location where we store the journal.
 535 * @j_blk_offset: starting block offset into the device where we store the
536 * journal
537 * @j_fs_dev: Device which holds the client fs. For internal journal this will
538 * be equal to j_dev
539 * @j_maxlen: Total maximum capacity of the journal region on disk.
540 * @j_list_lock: Protects the buffer lists and internal buffer state.
541 * @j_inode: Optional inode where we store the journal. If present, all journal
542 * block numbers are mapped into this inode via bmap().
543 * @j_tail_sequence: Sequence number of the oldest transaction in the log
544 * @j_transaction_sequence: Sequence number of the next transaction to grant
545 * @j_commit_sequence: Sequence number of the most recently committed
546 * transaction
547 * @j_commit_request: Sequence number of the most recent transaction wanting
548 * commit
549 * @j_commit_waited: Sequence number of the most recent transaction someone
550 * is waiting for to commit.
551 * @j_uuid: Uuid of client object.
552 * @j_task: Pointer to the current commit thread for this journal
553 * @j_max_transaction_buffers: Maximum number of metadata buffers to allow in a
554 * single compound commit transaction
555 * @j_commit_interval: What is the maximum transaction lifetime before we begin
556 * a commit?
557 * @j_commit_timer: The timer used to wakeup the commit thread
558 * @j_revoke_lock: Protect the revoke table
559 * @j_revoke: The revoke table - maintains the list of revoked blocks in the
560 * current transaction.
561 * @j_revoke_table: alternate revoke tables for j_revoke
562 * @j_wbuf: array of buffer_heads for journal_commit_transaction
563 * @j_wbufsize: maximum number of buffer_heads allowed in j_wbuf, the
564 * number that will fit in j_blocksize
565 * @j_last_sync_writer: most recent pid which did a synchronous write
566 * @j_average_commit_time: the average amount of time in nanoseconds it
567 * takes to commit a transaction to the disk.
568 * @j_private: An opaque pointer to fs-private information.
569 */
570
571struct journal_s
572{
573 /* General journaling state flags [j_state_lock] */
574 unsigned long j_flags;
575
576 /*
577 * Is there an outstanding uncleared error on the journal (from a prior
578 * abort)? [j_state_lock]
579 */
580 int j_errno;
581
582 /* The superblock buffer */
583 struct buffer_head *j_sb_buffer;
584 journal_superblock_t *j_superblock;
585
586 /* Version of the superblock format */
587 int j_format_version;
588
589 /*
590 * Protect the various scalars in the journal
591 */
592 spinlock_t j_state_lock;
593
594 /*
595 * Number of processes waiting to create a barrier lock [j_state_lock]
596 */
597 int j_barrier_count;
598
599 /*
600 * Transactions: The current running transaction...
601 * [j_state_lock] [caller holding open handle]
602 */
603 transaction_t *j_running_transaction;
604
605 /*
606 * the transaction we are pushing to disk
607 * [j_state_lock] [caller holding open handle]
608 */
609 transaction_t *j_committing_transaction;
610
611 /*
612 * ... and a linked circular list of all transactions waiting for
613 * checkpointing. [j_list_lock]
614 */
615 transaction_t *j_checkpoint_transactions;
616
617 /*
618 * Wait queue for waiting for a locked transaction to start committing,
619 * or for a barrier lock to be released
620 */
621 wait_queue_head_t j_wait_transaction_locked;
622
623 /* Wait queue for waiting for checkpointing to complete */
624 wait_queue_head_t j_wait_logspace;
625
626 /* Wait queue for waiting for commit to complete */
627 wait_queue_head_t j_wait_done_commit;
628
629 /* Wait queue to trigger checkpointing */
630 wait_queue_head_t j_wait_checkpoint;
631
632 /* Wait queue to trigger commit */
633 wait_queue_head_t j_wait_commit;
634
635 /* Wait queue to wait for updates to complete */
636 wait_queue_head_t j_wait_updates;
637
 638	/* Mutex for locking against concurrent checkpoints */
639 struct mutex j_checkpoint_mutex;
640
641 /*
642 * Journal head: identifies the first unused block in the journal.
643 * [j_state_lock]
644 */
645 unsigned int j_head;
646
647 /*
648 * Journal tail: identifies the oldest still-used block in the journal.
649 * [j_state_lock]
650 */
651 unsigned int j_tail;
652
653 /*
654 * Journal free: how many free blocks are there in the journal?
655 * [j_state_lock]
656 */
657 unsigned int j_free;
658
659 /*
660 * Journal start and end: the block numbers of the first usable block
661 * and one beyond the last usable block in the journal. [j_state_lock]
662 */
663 unsigned int j_first;
664 unsigned int j_last;
665
666 /*
667 * Device, blocksize and starting block offset for the location where we
668 * store the journal.
669 */
670 struct block_device *j_dev;
671 int j_blocksize;
672 unsigned int j_blk_offset;
673
674 /*
675 * Device which holds the client fs. For internal journal this will be
676 * equal to j_dev.
677 */
678 struct block_device *j_fs_dev;
679
680 /* Total maximum capacity of the journal region on disk. */
681 unsigned int j_maxlen;
682
683 /*
684 * Protects the buffer lists and internal buffer state.
685 */
686 spinlock_t j_list_lock;
687
688 /* Optional inode where we store the journal. If present, all */
689 /* journal block numbers are mapped into this inode via */
690 /* bmap(). */
691 struct inode *j_inode;
692
693 /*
694 * Sequence number of the oldest transaction in the log [j_state_lock]
695 */
696 tid_t j_tail_sequence;
697
698 /*
699 * Sequence number of the next transaction to grant [j_state_lock]
700 */
701 tid_t j_transaction_sequence;
702
703 /*
704 * Sequence number of the most recently committed transaction
705 * [j_state_lock].
706 */
707 tid_t j_commit_sequence;
708
709 /*
710 * Sequence number of the most recent transaction wanting commit
711 * [j_state_lock]
712 */
713 tid_t j_commit_request;
714
715 /*
716 * Sequence number of the most recent transaction someone is waiting
717 * for to commit.
718 * [j_state_lock]
719 */
720 tid_t j_commit_waited;
721
722 /*
723 * Journal uuid: identifies the object (filesystem, LVM volume etc)
724 * backed by this journal. This will eventually be replaced by an array
725 * of uuids, allowing us to index multiple devices within a single
726 * journal and to perform atomic updates across them.
727 */
728 __u8 j_uuid[16];
729
730 /* Pointer to the current commit thread for this journal */
731 struct task_struct *j_task;
732
733 /*
734 * Maximum number of metadata buffers to allow in a single compound
735 * commit transaction
736 */
737 int j_max_transaction_buffers;
738
739 /*
740 * What is the maximum transaction lifetime before we begin a commit?
741 */
742 unsigned long j_commit_interval;
743
744 /* The timer used to wakeup the commit thread: */
745 struct timer_list j_commit_timer;
746
747 /*
748 * The revoke table: maintains the list of revoked blocks in the
749 * current transaction. [j_revoke_lock]
750 */
751 spinlock_t j_revoke_lock;
752 struct jbd_revoke_table_s *j_revoke;
753 struct jbd_revoke_table_s *j_revoke_table[2];
754
755 /*
756 * array of bhs for journal_commit_transaction
757 */
758 struct buffer_head **j_wbuf;
759 int j_wbufsize;
760
761 /*
762 * this is the pid of the last person to run a synchronous operation
763 * through the journal.
764 */
765 pid_t j_last_sync_writer;
766
767 /*
768 * the average amount of time in nanoseconds it takes to commit a
769 * transaction to the disk. [j_state_lock]
770 */
771 u64 j_average_commit_time;
772
773 /*
774 * An opaque pointer to fs-private information. ext3 puts its
775 * superblock pointer here
776 */
777 void *j_private;
778};
779
780/*
781 * Journal flag definitions
782 */
783#define JFS_UNMOUNT 0x001 /* Journal thread is being destroyed */
784#define JFS_ABORT 0x002 /* Journaling has been aborted for errors. */
785#define JFS_ACK_ERR 0x004 /* The errno in the sb has been acked */
786#define JFS_FLUSHED 0x008 /* The journal superblock has been flushed */
787#define JFS_LOADED 0x010 /* The journal superblock has been loaded */
788#define JFS_BARRIER 0x020 /* Use IDE barriers */
789#define JFS_ABORT_ON_SYNCDATA_ERR 0x040 /* Abort the journal on file
790 * data write error in ordered
791 * mode */
792
793/*
794 * Function declarations for the journaling transaction and buffer
795 * management
796 */
797
798/* Filing buffers */
799extern void journal_unfile_buffer(journal_t *, struct journal_head *);
800extern void __journal_unfile_buffer(struct journal_head *);
801extern void __journal_refile_buffer(struct journal_head *);
802extern void journal_refile_buffer(journal_t *, struct journal_head *);
803extern void __journal_file_buffer(struct journal_head *, transaction_t *, int);
804extern void __journal_free_buffer(struct journal_head *bh);
805extern void journal_file_buffer(struct journal_head *, transaction_t *, int);
806extern void __journal_clean_data_list(transaction_t *transaction);
807
808/* Log buffer allocation */
809extern struct journal_head * journal_get_descriptor_buffer(journal_t *);
810int journal_next_log_block(journal_t *, unsigned int *);
811
812/* Commit management */
813extern void journal_commit_transaction(journal_t *);
814
815/* Checkpoint list management */
816int __journal_clean_checkpoint_list(journal_t *journal);
817int __journal_remove_checkpoint(struct journal_head *);
818void __journal_insert_checkpoint(struct journal_head *, transaction_t *);
819
820/* Buffer IO */
821extern int
822journal_write_metadata_buffer(transaction_t *transaction,
823 struct journal_head *jh_in,
824 struct journal_head **jh_out,
825 unsigned int blocknr);
826
827/* Transaction locking */
828extern void __wait_on_journal (journal_t *);
829
830/*
831 * Journal locking.
832 *
833 * We need to lock the journal during transaction state changes so that nobody
834 * ever tries to take a handle on the running transaction while we are in the
835 * middle of moving it to the commit phase. j_state_lock does this.
836 *
837 * Note that the locking is completely interrupt unsafe. We never touch
838 * journal structures from interrupts.
839 */
840
841static inline handle_t *journal_current_handle(void)
842{
843 return current->journal_info;
844}
845
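/*
 * Illustrative sketch only, not part of the original header:
 * journal_current_handle() lets code detect that the running task already
 * owns an open handle, which is how nested journal_start() deadlocks are
 * avoided.  example_inside_transaction() is a hypothetical helper.
 */
static inline int example_inside_transaction(void)
{
	return journal_current_handle() != NULL;
}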
846/* The journaling code user interface:
847 *
848 * Create and destroy handles
849 * Register buffer modifications against the current transaction.
850 */
851
852extern handle_t *journal_start(journal_t *, int nblocks);
853extern int journal_restart (handle_t *, int nblocks);
854extern int journal_extend (handle_t *, int nblocks);
855extern int journal_get_write_access(handle_t *, struct buffer_head *);
856extern int journal_get_create_access (handle_t *, struct buffer_head *);
857extern int journal_get_undo_access(handle_t *, struct buffer_head *);
858extern int journal_dirty_data (handle_t *, struct buffer_head *);
859extern int journal_dirty_metadata (handle_t *, struct buffer_head *);
860extern void journal_release_buffer (handle_t *, struct buffer_head *);
861extern int journal_forget (handle_t *, struct buffer_head *);
862extern void journal_sync_buffer (struct buffer_head *);
863extern void journal_invalidatepage(journal_t *,
864 struct page *, unsigned int, unsigned int);
865extern int journal_try_to_free_buffers(journal_t *, struct page *, gfp_t);
866extern int journal_stop(handle_t *);
867extern int journal_flush (journal_t *);
868extern void journal_lock_updates (journal_t *);
869extern void journal_unlock_updates (journal_t *);
870
871extern journal_t * journal_init_dev(struct block_device *bdev,
872 struct block_device *fs_dev,
873 int start, int len, int bsize);
874extern journal_t * journal_init_inode (struct inode *);
875extern int journal_update_format (journal_t *);
876extern int journal_check_used_features
877 (journal_t *, unsigned long, unsigned long, unsigned long);
878extern int journal_check_available_features
879 (journal_t *, unsigned long, unsigned long, unsigned long);
880extern int journal_set_features
881 (journal_t *, unsigned long, unsigned long, unsigned long);
882extern int journal_create (journal_t *);
883extern int journal_load (journal_t *journal);
884extern int journal_destroy (journal_t *);
885extern int journal_recover (journal_t *journal);
886extern int journal_wipe (journal_t *, int);
887extern int journal_skip_recovery (journal_t *);
888extern void journal_update_sb_log_tail (journal_t *, tid_t, unsigned int,
889 int);
890extern void journal_abort (journal_t *, int);
891extern int journal_errno (journal_t *);
892extern void journal_ack_err (journal_t *);
893extern int journal_clear_err (journal_t *);
894extern int journal_bmap(journal_t *, unsigned int, unsigned int *);
895extern int journal_force_commit(journal_t *);
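/*
 * Illustrative sketch only, not part of the original header: a client
 * filesystem typically brackets a metadata update with the handle API
 * declared above.  example_update_block() is a hypothetical helper and the
 * error handling is abbreviated.
 */
static inline int example_update_block(journal_t *journal,
				       struct buffer_head *bh)
{
	handle_t *handle = journal_start(journal, 1);
	int err;

	if (IS_ERR(handle))
		return PTR_ERR(handle);
	err = journal_get_write_access(handle, bh);
	if (!err) {
		/* ... modify bh->b_data under the handle here ... */
		err = journal_dirty_metadata(handle, bh);
	}
	journal_stop(handle);
	return err;
}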
896
897/*
898 * journal_head management
899 */
900struct journal_head *journal_add_journal_head(struct buffer_head *bh);
901struct journal_head *journal_grab_journal_head(struct buffer_head *bh);
902void journal_put_journal_head(struct journal_head *jh);
903
904/*
905 * handle management
906 */
907extern struct kmem_cache *jbd_handle_cache;
908
909static inline handle_t *jbd_alloc_handle(gfp_t gfp_flags)
910{
911 return kmem_cache_zalloc(jbd_handle_cache, gfp_flags);
912}
913
914static inline void jbd_free_handle(handle_t *handle)
915{
916 kmem_cache_free(jbd_handle_cache, handle);
917}
918
919/* Primary revoke support */
920#define JOURNAL_REVOKE_DEFAULT_HASH 256
921extern int journal_init_revoke(journal_t *, int);
922extern void journal_destroy_revoke_caches(void);
923extern int journal_init_revoke_caches(void);
924
925extern void journal_destroy_revoke(journal_t *);
926extern int journal_revoke (handle_t *,
927 unsigned int, struct buffer_head *);
928extern int journal_cancel_revoke(handle_t *, struct journal_head *);
929extern void journal_write_revoke_records(journal_t *,
930 transaction_t *, int);
931
932/* Recovery revoke support */
933extern int journal_set_revoke(journal_t *, unsigned int, tid_t);
934extern int journal_test_revoke(journal_t *, unsigned int, tid_t);
935extern void journal_clear_revoke(journal_t *);
936extern void journal_switch_revoke_table(journal_t *journal);
937extern void journal_clear_buffer_revoked_flags(journal_t *journal);
938
939/*
940 * The log thread user interface:
941 *
942 * Request space in the current transaction, and force transaction commit
943 * transitions on demand.
944 */
945
946int __log_space_left(journal_t *); /* Called with journal locked */
947int log_start_commit(journal_t *journal, tid_t tid);
948int __log_start_commit(journal_t *journal, tid_t tid);
949int journal_start_commit(journal_t *journal, tid_t *tid);
950int journal_force_commit_nested(journal_t *journal);
951int log_wait_commit(journal_t *journal, tid_t tid);
952int log_do_checkpoint(journal_t *journal);
953int journal_trans_will_send_data_barrier(journal_t *journal, tid_t tid);
954
955void __log_wait_for_space(journal_t *journal);
956extern void __journal_drop_transaction(journal_t *, transaction_t *);
957extern int cleanup_journal_tail(journal_t *);
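/*
 * Illustrative sketch only, not part of the original header: forcing the
 * running transaction out to disk is a two-step pattern with the interfaces
 * above - request the commit, then wait on the returned tid.
 * example_force_and_wait() is a hypothetical helper.
 */
static inline int example_force_and_wait(journal_t *journal)
{
	tid_t tid;

	if (journal_start_commit(journal, &tid))
		return log_wait_commit(journal, tid);
	return 0;
}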
958
959/*
960 * is_journal_abort
961 *
962 * Simple test wrapper function to test the JFS_ABORT state flag. This
963 * bit, when set, indicates that we have had a fatal error somewhere,
964 * either inside the journaling layer or indicated to us by the client
 965 * (e.g. ext3), and that we should not commit any further
966 * transactions.
967 */
968
969static inline int is_journal_aborted(journal_t *journal)
970{
971 return journal->j_flags & JFS_ABORT;
972}
973
974static inline int is_handle_aborted(handle_t *handle)
975{
976 if (handle->h_aborted)
977 return 1;
978 return is_journal_aborted(handle->h_transaction->t_journal);
979}
980
981static inline void journal_abort_handle(handle_t *handle)
982{
983 handle->h_aborted = 1;
984}
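/*
 * Illustrative sketch only, not part of the original header: once a handle
 * (or the whole journal) is aborted no further metadata may be journalled,
 * so callers bail out early.  example_checked_dirty() is a hypothetical
 * helper; journal_dirty_metadata() performs an equivalent check itself.
 */
static inline int example_checked_dirty(handle_t *handle,
					struct buffer_head *bh)
{
	if (is_handle_aborted(handle))
		return -EROFS;
	return journal_dirty_metadata(handle, bh);
}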
985
986#endif /* __KERNEL__ */
987
988/* Comparison functions for transaction IDs: perform comparisons using
989 * modulo arithmetic so that they work over sequence number wraps. */
990
991static inline int tid_gt(tid_t x, tid_t y)
992{
993 int difference = (x - y);
994 return (difference > 0);
995}
996
997static inline int tid_geq(tid_t x, tid_t y)
998{
999 int difference = (x - y);
1000 return (difference >= 0);
1001}
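/*
 * Illustrative sketch only, not part of the original header: because the
 * comparison is done modulo 2^32, ordering survives a sequence-number wrap.
 * example_tid_wrap() is a hypothetical helper showing one such case.
 */
static inline int example_tid_wrap(void)
{
	/* tid 3 was granted after the counter wrapped past 0xfffffff0,
	 * so it must still compare as the newer transaction. */
	return tid_gt((tid_t)3, (tid_t)0xfffffff0);	/* evaluates to 1 */
}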
1002
1003extern int journal_blocks_per_page(struct inode *inode);
1004
1005/*
1006 * Return the minimum number of blocks which must be free in the journal
1007 * before a new transaction may be started. Must be called under j_state_lock.
1008 */
1009static inline int jbd_space_needed(journal_t *journal)
1010{
1011 int nblocks = journal->j_max_transaction_buffers;
1012 if (journal->j_committing_transaction)
1013 nblocks += journal->j_committing_transaction->
1014 t_outstanding_credits;
1015 return nblocks;
1016}
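/*
 * Illustrative sketch only, not part of the original header: the helper
 * above is meant to be compared against the free space in the log (under
 * j_state_lock) before a new handle is allowed to start.
 * example_log_space_ok() is a hypothetical, simplified version of that
 * check; the real code goes through __log_space_left().
 */
static inline int example_log_space_ok(journal_t *journal)
{
	/* Caller must hold j_state_lock. */
	return journal->j_free > jbd_space_needed(journal);
}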
1017
1018/*
1019 * Definitions which augment the buffer_head layer
1020 */
1021
1022/* journaling buffer types */
1023#define BJ_None 0 /* Not journaled */
1024#define BJ_SyncData 1 /* Normal data: flush before commit */
1025#define BJ_Metadata 2 /* Normal journaled metadata */
1026#define BJ_Forget 3 /* Buffer superseded by this transaction */
1027#define BJ_IO 4 /* Buffer is for temporary IO use */
1028#define BJ_Shadow 5 /* Buffer contents being shadowed to the log */
1029#define BJ_LogCtl 6 /* Buffer contains log descriptors */
1030#define BJ_Reserved 7 /* Buffer is reserved for access by journal */
1031#define BJ_Locked 8 /* Locked for I/O during commit */
1032#define BJ_Types 9
1033
1034extern int jbd_blocks_per_page(struct inode *inode);
1035
1036#ifdef __KERNEL__
1037
1038#define buffer_trace_init(bh) do {} while (0)
1039#define print_buffer_fields(bh) do {} while (0)
1040#define print_buffer_trace(bh) do {} while (0)
1041#define BUFFER_TRACE(bh, info) do {} while (0)
1042#define BUFFER_TRACE2(bh, bh2, info) do {} while (0)
1043#define JBUFFER_TRACE(jh, info) do {} while (0)
1044
1045#endif /* __KERNEL__ */
1046
1047#endif /* _LINUX_JBD_H */
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index edb640ae9a94..ad4b28647298 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -29,6 +29,7 @@
29#include <linux/mutex.h> 29#include <linux/mutex.h>
30#include <linux/timer.h> 30#include <linux/timer.h>
31#include <linux/slab.h> 31#include <linux/slab.h>
32#include <linux/bit_spinlock.h>
32#include <crypto/hash.h> 33#include <crypto/hash.h>
33#endif 34#endif
34 35
@@ -336,7 +337,45 @@ BUFFER_FNS(Freed, freed)
336BUFFER_FNS(Shadow, shadow) 337BUFFER_FNS(Shadow, shadow)
337BUFFER_FNS(Verified, verified) 338BUFFER_FNS(Verified, verified)
338 339
339#include <linux/jbd_common.h> 340static inline struct buffer_head *jh2bh(struct journal_head *jh)
341{
342 return jh->b_bh;
343}
344
345static inline struct journal_head *bh2jh(struct buffer_head *bh)
346{
347 return bh->b_private;
348}
349
350static inline void jbd_lock_bh_state(struct buffer_head *bh)
351{
352 bit_spin_lock(BH_State, &bh->b_state);
353}
354
355static inline int jbd_trylock_bh_state(struct buffer_head *bh)
356{
357 return bit_spin_trylock(BH_State, &bh->b_state);
358}
359
360static inline int jbd_is_locked_bh_state(struct buffer_head *bh)
361{
362 return bit_spin_is_locked(BH_State, &bh->b_state);
363}
364
365static inline void jbd_unlock_bh_state(struct buffer_head *bh)
366{
367 bit_spin_unlock(BH_State, &bh->b_state);
368}
369
370static inline void jbd_lock_bh_journal_head(struct buffer_head *bh)
371{
372 bit_spin_lock(BH_JournalHead, &bh->b_state);
373}
374
375static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
376{
377 bit_spin_unlock(BH_JournalHead, &bh->b_state);
378}
340 379
341#define J_ASSERT(assert) BUG_ON(!(assert)) 380#define J_ASSERT(assert) BUG_ON(!(assert))
342 381
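/*
 * Illustrative sketch only, not part of this patch: the bit-spinlock helpers
 * folded into jbd2.h above serialise per-buffer journal_head state, typically
 * in the pattern below.  example_peek_jh() is a hypothetical helper;
 * jbd2_journal_grab_journal_head() is the real, refcounted variant.
 */
static inline struct journal_head *example_peek_jh(struct buffer_head *bh)
{
	struct journal_head *jh;

	jbd_lock_bh_journal_head(bh);
	jh = bh2jh(bh);		/* bh->b_private, stable under the lock */
	jbd_unlock_bh_journal_head(bh);
	return jh;
}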
diff --git a/include/linux/jbd_common.h b/include/linux/jbd_common.h
deleted file mode 100644
index 3dc53432355f..000000000000
--- a/include/linux/jbd_common.h
+++ /dev/null
@@ -1,46 +0,0 @@
1#ifndef _LINUX_JBD_STATE_H
2#define _LINUX_JBD_STATE_H
3
4#include <linux/bit_spinlock.h>
5
6static inline struct buffer_head *jh2bh(struct journal_head *jh)
7{
8 return jh->b_bh;
9}
10
11static inline struct journal_head *bh2jh(struct buffer_head *bh)
12{
13 return bh->b_private;
14}
15
16static inline void jbd_lock_bh_state(struct buffer_head *bh)
17{
18 bit_spin_lock(BH_State, &bh->b_state);
19}
20
21static inline int jbd_trylock_bh_state(struct buffer_head *bh)
22{
23 return bit_spin_trylock(BH_State, &bh->b_state);
24}
25
26static inline int jbd_is_locked_bh_state(struct buffer_head *bh)
27{
28 return bit_spin_is_locked(BH_State, &bh->b_state);
29}
30
31static inline void jbd_unlock_bh_state(struct buffer_head *bh)
32{
33 bit_spin_unlock(BH_State, &bh->b_state);
34}
35
36static inline void jbd_lock_bh_journal_head(struct buffer_head *bh)
37{
38 bit_spin_lock(BH_JournalHead, &bh->b_state);
39}
40
41static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
42{
43 bit_spin_unlock(BH_JournalHead, &bh->b_state);
44}
45
46#endif
diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h
index 77ca6601ff25..7a57c28eb5e7 100644
--- a/include/linux/quotaops.h
+++ b/include/linux/quotaops.h
@@ -43,7 +43,7 @@ void inode_claim_rsv_space(struct inode *inode, qsize_t number);
43void inode_sub_rsv_space(struct inode *inode, qsize_t number); 43void inode_sub_rsv_space(struct inode *inode, qsize_t number);
44void inode_reclaim_rsv_space(struct inode *inode, qsize_t number); 44void inode_reclaim_rsv_space(struct inode *inode, qsize_t number);
45 45
46void dquot_initialize(struct inode *inode); 46int dquot_initialize(struct inode *inode);
47void dquot_drop(struct inode *inode); 47void dquot_drop(struct inode *inode);
48struct dquot *dqget(struct super_block *sb, struct kqid qid); 48struct dquot *dqget(struct super_block *sb, struct kqid qid);
49static inline struct dquot *dqgrab(struct dquot *dquot) 49static inline struct dquot *dqgrab(struct dquot *dquot)
@@ -200,8 +200,9 @@ static inline int sb_has_quota_active(struct super_block *sb, int type)
200 return 0; 200 return 0;
201} 201}
202 202
203static inline void dquot_initialize(struct inode *inode) 203static inline int dquot_initialize(struct inode *inode)
204{ 204{
205 return 0;
205} 206}
206 207
207static inline void dquot_drop(struct inode *inode) 208static inline void dquot_drop(struct inode *inode)
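/*
 * Illustrative sketch only, not part of this patch: with dquot_initialize()
 * now returning an error code, filesystem callers are expected to propagate
 * failures instead of ignoring them, roughly as below.  example_fs_unlink()
 * is a hypothetical caller, not code from this series.
 */
static int example_fs_unlink(struct inode *dir, struct dentry *dentry)
{
	int err;

	err = dquot_initialize(dir);
	if (err)
		return err;
	err = dquot_initialize(d_inode(dentry));
	if (err)
		return err;
	/* ... proceed with the normal unlink path ... */
	return 0;
}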
diff --git a/include/trace/events/ext3.h b/include/trace/events/ext3.h
deleted file mode 100644
index fc733d28117a..000000000000
--- a/include/trace/events/ext3.h
+++ /dev/null
@@ -1,866 +0,0 @@
1#undef TRACE_SYSTEM
2#define TRACE_SYSTEM ext3
3
4#if !defined(_TRACE_EXT3_H) || defined(TRACE_HEADER_MULTI_READ)
5#define _TRACE_EXT3_H
6
7#include <linux/tracepoint.h>
8
9TRACE_EVENT(ext3_free_inode,
10 TP_PROTO(struct inode *inode),
11
12 TP_ARGS(inode),
13
14 TP_STRUCT__entry(
15 __field( dev_t, dev )
16 __field( ino_t, ino )
17 __field( umode_t, mode )
18 __field( uid_t, uid )
19 __field( gid_t, gid )
20 __field( blkcnt_t, blocks )
21 ),
22
23 TP_fast_assign(
24 __entry->dev = inode->i_sb->s_dev;
25 __entry->ino = inode->i_ino;
26 __entry->mode = inode->i_mode;
27 __entry->uid = i_uid_read(inode);
28 __entry->gid = i_gid_read(inode);
29 __entry->blocks = inode->i_blocks;
30 ),
31
32 TP_printk("dev %d,%d ino %lu mode 0%o uid %u gid %u blocks %lu",
33 MAJOR(__entry->dev), MINOR(__entry->dev),
34 (unsigned long) __entry->ino,
35 __entry->mode, __entry->uid, __entry->gid,
36 (unsigned long) __entry->blocks)
37);
38
39TRACE_EVENT(ext3_request_inode,
40 TP_PROTO(struct inode *dir, int mode),
41
42 TP_ARGS(dir, mode),
43
44 TP_STRUCT__entry(
45 __field( dev_t, dev )
46 __field( ino_t, dir )
47 __field( umode_t, mode )
48 ),
49
50 TP_fast_assign(
51 __entry->dev = dir->i_sb->s_dev;
52 __entry->dir = dir->i_ino;
53 __entry->mode = mode;
54 ),
55
56 TP_printk("dev %d,%d dir %lu mode 0%o",
57 MAJOR(__entry->dev), MINOR(__entry->dev),
58 (unsigned long) __entry->dir, __entry->mode)
59);
60
61TRACE_EVENT(ext3_allocate_inode,
62 TP_PROTO(struct inode *inode, struct inode *dir, int mode),
63
64 TP_ARGS(inode, dir, mode),
65
66 TP_STRUCT__entry(
67 __field( dev_t, dev )
68 __field( ino_t, ino )
69 __field( ino_t, dir )
70 __field( umode_t, mode )
71 ),
72
73 TP_fast_assign(
74 __entry->dev = inode->i_sb->s_dev;
75 __entry->ino = inode->i_ino;
76 __entry->dir = dir->i_ino;
77 __entry->mode = mode;
78 ),
79
80 TP_printk("dev %d,%d ino %lu dir %lu mode 0%o",
81 MAJOR(__entry->dev), MINOR(__entry->dev),
82 (unsigned long) __entry->ino,
83 (unsigned long) __entry->dir, __entry->mode)
84);
85
86TRACE_EVENT(ext3_evict_inode,
87 TP_PROTO(struct inode *inode),
88
89 TP_ARGS(inode),
90
91 TP_STRUCT__entry(
92 __field( dev_t, dev )
93 __field( ino_t, ino )
94 __field( int, nlink )
95 ),
96
97 TP_fast_assign(
98 __entry->dev = inode->i_sb->s_dev;
99 __entry->ino = inode->i_ino;
100 __entry->nlink = inode->i_nlink;
101 ),
102
103 TP_printk("dev %d,%d ino %lu nlink %d",
104 MAJOR(__entry->dev), MINOR(__entry->dev),
105 (unsigned long) __entry->ino, __entry->nlink)
106);
107
108TRACE_EVENT(ext3_drop_inode,
109 TP_PROTO(struct inode *inode, int drop),
110
111 TP_ARGS(inode, drop),
112
113 TP_STRUCT__entry(
114 __field( dev_t, dev )
115 __field( ino_t, ino )
116 __field( int, drop )
117 ),
118
119 TP_fast_assign(
120 __entry->dev = inode->i_sb->s_dev;
121 __entry->ino = inode->i_ino;
122 __entry->drop = drop;
123 ),
124
125 TP_printk("dev %d,%d ino %lu drop %d",
126 MAJOR(__entry->dev), MINOR(__entry->dev),
127 (unsigned long) __entry->ino, __entry->drop)
128);
129
130TRACE_EVENT(ext3_mark_inode_dirty,
131 TP_PROTO(struct inode *inode, unsigned long IP),
132
133 TP_ARGS(inode, IP),
134
135 TP_STRUCT__entry(
136 __field( dev_t, dev )
137 __field( ino_t, ino )
138 __field(unsigned long, ip )
139 ),
140
141 TP_fast_assign(
142 __entry->dev = inode->i_sb->s_dev;
143 __entry->ino = inode->i_ino;
144 __entry->ip = IP;
145 ),
146
147 TP_printk("dev %d,%d ino %lu caller %pS",
148 MAJOR(__entry->dev), MINOR(__entry->dev),
149 (unsigned long) __entry->ino, (void *)__entry->ip)
150);
151
152TRACE_EVENT(ext3_write_begin,
153 TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
154 unsigned int flags),
155
156 TP_ARGS(inode, pos, len, flags),
157
158 TP_STRUCT__entry(
159 __field( dev_t, dev )
160 __field( ino_t, ino )
161 __field( loff_t, pos )
162 __field( unsigned int, len )
163 __field( unsigned int, flags )
164 ),
165
166 TP_fast_assign(
167 __entry->dev = inode->i_sb->s_dev;
168 __entry->ino = inode->i_ino;
169 __entry->pos = pos;
170 __entry->len = len;
171 __entry->flags = flags;
172 ),
173
174 TP_printk("dev %d,%d ino %lu pos %llu len %u flags %u",
175 MAJOR(__entry->dev), MINOR(__entry->dev),
176 (unsigned long) __entry->ino,
177 (unsigned long long) __entry->pos, __entry->len,
178 __entry->flags)
179);
180
181DECLARE_EVENT_CLASS(ext3__write_end,
182 TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
183 unsigned int copied),
184
185 TP_ARGS(inode, pos, len, copied),
186
187 TP_STRUCT__entry(
188 __field( dev_t, dev )
189 __field( ino_t, ino )
190 __field( loff_t, pos )
191 __field( unsigned int, len )
192 __field( unsigned int, copied )
193 ),
194
195 TP_fast_assign(
196 __entry->dev = inode->i_sb->s_dev;
197 __entry->ino = inode->i_ino;
198 __entry->pos = pos;
199 __entry->len = len;
200 __entry->copied = copied;
201 ),
202
203 TP_printk("dev %d,%d ino %lu pos %llu len %u copied %u",
204 MAJOR(__entry->dev), MINOR(__entry->dev),
205 (unsigned long) __entry->ino,
206 (unsigned long long) __entry->pos, __entry->len,
207 __entry->copied)
208);
209
210DEFINE_EVENT(ext3__write_end, ext3_ordered_write_end,
211
212 TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
213 unsigned int copied),
214
215 TP_ARGS(inode, pos, len, copied)
216);
217
218DEFINE_EVENT(ext3__write_end, ext3_writeback_write_end,
219
220 TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
221 unsigned int copied),
222
223 TP_ARGS(inode, pos, len, copied)
224);
225
226DEFINE_EVENT(ext3__write_end, ext3_journalled_write_end,
227
228 TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
229 unsigned int copied),
230
231 TP_ARGS(inode, pos, len, copied)
232);
233
234DECLARE_EVENT_CLASS(ext3__page_op,
235 TP_PROTO(struct page *page),
236
237 TP_ARGS(page),
238
239 TP_STRUCT__entry(
240 __field( dev_t, dev )
241 __field( ino_t, ino )
242 __field( pgoff_t, index )
243
244 ),
245
246 TP_fast_assign(
247 __entry->index = page->index;
248 __entry->ino = page->mapping->host->i_ino;
249 __entry->dev = page->mapping->host->i_sb->s_dev;
250 ),
251
252 TP_printk("dev %d,%d ino %lu page_index %lu",
253 MAJOR(__entry->dev), MINOR(__entry->dev),
254 (unsigned long) __entry->ino, __entry->index)
255);
256
257DEFINE_EVENT(ext3__page_op, ext3_ordered_writepage,
258
259 TP_PROTO(struct page *page),
260
261 TP_ARGS(page)
262);
263
264DEFINE_EVENT(ext3__page_op, ext3_writeback_writepage,
265
266 TP_PROTO(struct page *page),
267
268 TP_ARGS(page)
269);
270
271DEFINE_EVENT(ext3__page_op, ext3_journalled_writepage,
272
273 TP_PROTO(struct page *page),
274
275 TP_ARGS(page)
276);
277
278DEFINE_EVENT(ext3__page_op, ext3_readpage,
279
280 TP_PROTO(struct page *page),
281
282 TP_ARGS(page)
283);
284
285DEFINE_EVENT(ext3__page_op, ext3_releasepage,
286
287 TP_PROTO(struct page *page),
288
289 TP_ARGS(page)
290);
291
292TRACE_EVENT(ext3_invalidatepage,
293 TP_PROTO(struct page *page, unsigned int offset, unsigned int length),
294
295 TP_ARGS(page, offset, length),
296
297 TP_STRUCT__entry(
298 __field( pgoff_t, index )
299 __field( unsigned int, offset )
300 __field( unsigned int, length )
301 __field( ino_t, ino )
302 __field( dev_t, dev )
303
304 ),
305
306 TP_fast_assign(
307 __entry->index = page->index;
308 __entry->offset = offset;
309 __entry->length = length;
310 __entry->ino = page->mapping->host->i_ino;
311 __entry->dev = page->mapping->host->i_sb->s_dev;
312 ),
313
314 TP_printk("dev %d,%d ino %lu page_index %lu offset %u length %u",
315 MAJOR(__entry->dev), MINOR(__entry->dev),
316 (unsigned long) __entry->ino,
317 __entry->index, __entry->offset, __entry->length)
318);
319
320TRACE_EVENT(ext3_discard_blocks,
321 TP_PROTO(struct super_block *sb, unsigned long blk,
322 unsigned long count),
323
324 TP_ARGS(sb, blk, count),
325
326 TP_STRUCT__entry(
327 __field( dev_t, dev )
328 __field( unsigned long, blk )
329 __field( unsigned long, count )
330
331 ),
332
333 TP_fast_assign(
334 __entry->dev = sb->s_dev;
335 __entry->blk = blk;
336 __entry->count = count;
337 ),
338
339 TP_printk("dev %d,%d blk %lu count %lu",
340 MAJOR(__entry->dev), MINOR(__entry->dev),
341 __entry->blk, __entry->count)
342);
343
344TRACE_EVENT(ext3_request_blocks,
345 TP_PROTO(struct inode *inode, unsigned long goal,
346 unsigned long count),
347
348 TP_ARGS(inode, goal, count),
349
350 TP_STRUCT__entry(
351 __field( dev_t, dev )
352 __field( ino_t, ino )
353 __field( unsigned long, count )
354 __field( unsigned long, goal )
355 ),
356
357 TP_fast_assign(
358 __entry->dev = inode->i_sb->s_dev;
359 __entry->ino = inode->i_ino;
360 __entry->count = count;
361 __entry->goal = goal;
362 ),
363
364 TP_printk("dev %d,%d ino %lu count %lu goal %lu ",
365 MAJOR(__entry->dev), MINOR(__entry->dev),
366 (unsigned long) __entry->ino,
367 __entry->count, __entry->goal)
368);
369
370TRACE_EVENT(ext3_allocate_blocks,
371 TP_PROTO(struct inode *inode, unsigned long goal,
372 unsigned long count, unsigned long block),
373
374 TP_ARGS(inode, goal, count, block),
375
376 TP_STRUCT__entry(
377 __field( dev_t, dev )
378 __field( ino_t, ino )
379 __field( unsigned long, block )
380 __field( unsigned long, count )
381 __field( unsigned long, goal )
382 ),
383
384 TP_fast_assign(
385 __entry->dev = inode->i_sb->s_dev;
386 __entry->ino = inode->i_ino;
387 __entry->block = block;
388 __entry->count = count;
389 __entry->goal = goal;
390 ),
391
392 TP_printk("dev %d,%d ino %lu count %lu block %lu goal %lu",
393 MAJOR(__entry->dev), MINOR(__entry->dev),
394 (unsigned long) __entry->ino,
395 __entry->count, __entry->block,
396 __entry->goal)
397);
398
399TRACE_EVENT(ext3_free_blocks,
400 TP_PROTO(struct inode *inode, unsigned long block,
401 unsigned long count),
402
403 TP_ARGS(inode, block, count),
404
405 TP_STRUCT__entry(
406 __field( dev_t, dev )
407 __field( ino_t, ino )
408 __field( umode_t, mode )
409 __field( unsigned long, block )
410 __field( unsigned long, count )
411 ),
412
413 TP_fast_assign(
414 __entry->dev = inode->i_sb->s_dev;
415 __entry->ino = inode->i_ino;
416 __entry->mode = inode->i_mode;
417 __entry->block = block;
418 __entry->count = count;
419 ),
420
421 TP_printk("dev %d,%d ino %lu mode 0%o block %lu count %lu",
422 MAJOR(__entry->dev), MINOR(__entry->dev),
423 (unsigned long) __entry->ino,
424 __entry->mode, __entry->block, __entry->count)
425);
426
427TRACE_EVENT(ext3_sync_file_enter,
428 TP_PROTO(struct file *file, int datasync),
429
430 TP_ARGS(file, datasync),
431
432 TP_STRUCT__entry(
433 __field( dev_t, dev )
434 __field( ino_t, ino )
435 __field( ino_t, parent )
436 __field( int, datasync )
437 ),
438
439 TP_fast_assign(
440 struct dentry *dentry = file->f_path.dentry;
441
442 __entry->dev = d_inode(dentry)->i_sb->s_dev;
443 __entry->ino = d_inode(dentry)->i_ino;
444 __entry->datasync = datasync;
445 __entry->parent = d_inode(dentry->d_parent)->i_ino;
446 ),
447
448 TP_printk("dev %d,%d ino %lu parent %ld datasync %d ",
449 MAJOR(__entry->dev), MINOR(__entry->dev),
450 (unsigned long) __entry->ino,
451 (unsigned long) __entry->parent, __entry->datasync)
452);
453
454TRACE_EVENT(ext3_sync_file_exit,
455 TP_PROTO(struct inode *inode, int ret),
456
457 TP_ARGS(inode, ret),
458
459 TP_STRUCT__entry(
460 __field( int, ret )
461 __field( ino_t, ino )
462 __field( dev_t, dev )
463 ),
464
465 TP_fast_assign(
466 __entry->ret = ret;
467 __entry->ino = inode->i_ino;
468 __entry->dev = inode->i_sb->s_dev;
469 ),
470
471 TP_printk("dev %d,%d ino %lu ret %d",
472 MAJOR(__entry->dev), MINOR(__entry->dev),
473 (unsigned long) __entry->ino,
474 __entry->ret)
475);
476
477TRACE_EVENT(ext3_sync_fs,
478 TP_PROTO(struct super_block *sb, int wait),
479
480 TP_ARGS(sb, wait),
481
482 TP_STRUCT__entry(
483 __field( dev_t, dev )
484 __field( int, wait )
485
486 ),
487
488 TP_fast_assign(
489 __entry->dev = sb->s_dev;
490 __entry->wait = wait;
491 ),
492
493 TP_printk("dev %d,%d wait %d",
494 MAJOR(__entry->dev), MINOR(__entry->dev),
495 __entry->wait)
496);
497
498TRACE_EVENT(ext3_rsv_window_add,
499 TP_PROTO(struct super_block *sb,
500 struct ext3_reserve_window_node *rsv_node),
501
502 TP_ARGS(sb, rsv_node),
503
504 TP_STRUCT__entry(
505 __field( unsigned long, start )
506 __field( unsigned long, end )
507 __field( dev_t, dev )
508 ),
509
510 TP_fast_assign(
511 __entry->dev = sb->s_dev;
512 __entry->start = rsv_node->rsv_window._rsv_start;
513 __entry->end = rsv_node->rsv_window._rsv_end;
514 ),
515
516 TP_printk("dev %d,%d start %lu end %lu",
517 MAJOR(__entry->dev), MINOR(__entry->dev),
518 __entry->start, __entry->end)
519);
520
521TRACE_EVENT(ext3_discard_reservation,
522 TP_PROTO(struct inode *inode,
523 struct ext3_reserve_window_node *rsv_node),
524
525 TP_ARGS(inode, rsv_node),
526
527 TP_STRUCT__entry(
528 __field( unsigned long, start )
529 __field( unsigned long, end )
530 __field( ino_t, ino )
531 __field( dev_t, dev )
532 ),
533
534 TP_fast_assign(
535 __entry->start = rsv_node->rsv_window._rsv_start;
536 __entry->end = rsv_node->rsv_window._rsv_end;
537 __entry->ino = inode->i_ino;
538 __entry->dev = inode->i_sb->s_dev;
539 ),
540
541 TP_printk("dev %d,%d ino %lu start %lu end %lu",
542 MAJOR(__entry->dev), MINOR(__entry->dev),
543 (unsigned long)__entry->ino, __entry->start,
544 __entry->end)
545);
546
547TRACE_EVENT(ext3_alloc_new_reservation,
548 TP_PROTO(struct super_block *sb, unsigned long goal),
549
550 TP_ARGS(sb, goal),
551
552 TP_STRUCT__entry(
553 __field( dev_t, dev )
554 __field( unsigned long, goal )
555 ),
556
557 TP_fast_assign(
558 __entry->dev = sb->s_dev;
559 __entry->goal = goal;
560 ),
561
562 TP_printk("dev %d,%d goal %lu",
563 MAJOR(__entry->dev), MINOR(__entry->dev),
564 __entry->goal)
565);
566
567TRACE_EVENT(ext3_reserved,
568 TP_PROTO(struct super_block *sb, unsigned long block,
569 struct ext3_reserve_window_node *rsv_node),
570
571 TP_ARGS(sb, block, rsv_node),
572
573 TP_STRUCT__entry(
574 __field( unsigned long, block )
575 __field( unsigned long, start )
576 __field( unsigned long, end )
577 __field( dev_t, dev )
578 ),
579
580 TP_fast_assign(
581 __entry->block = block;
582 __entry->start = rsv_node->rsv_window._rsv_start;
583 __entry->end = rsv_node->rsv_window._rsv_end;
584 __entry->dev = sb->s_dev;
585 ),
586
587 TP_printk("dev %d,%d block %lu, start %lu end %lu",
588 MAJOR(__entry->dev), MINOR(__entry->dev),
589 __entry->block, __entry->start, __entry->end)
590);
591
592TRACE_EVENT(ext3_forget,
593 TP_PROTO(struct inode *inode, int is_metadata, unsigned long block),
594
595 TP_ARGS(inode, is_metadata, block),
596
597 TP_STRUCT__entry(
598 __field( dev_t, dev )
599 __field( ino_t, ino )
600 __field( umode_t, mode )
601 __field( int, is_metadata )
602 __field( unsigned long, block )
603 ),
604
605 TP_fast_assign(
606 __entry->dev = inode->i_sb->s_dev;
607 __entry->ino = inode->i_ino;
608 __entry->mode = inode->i_mode;
609 __entry->is_metadata = is_metadata;
610 __entry->block = block;
611 ),
612
613 TP_printk("dev %d,%d ino %lu mode 0%o is_metadata %d block %lu",
614 MAJOR(__entry->dev), MINOR(__entry->dev),
615 (unsigned long) __entry->ino,
616 __entry->mode, __entry->is_metadata, __entry->block)
617);
618
619TRACE_EVENT(ext3_read_block_bitmap,
620 TP_PROTO(struct super_block *sb, unsigned int group),
621
622 TP_ARGS(sb, group),
623
624 TP_STRUCT__entry(
625 __field( dev_t, dev )
626 __field( __u32, group )
627
628 ),
629
630 TP_fast_assign(
631 __entry->dev = sb->s_dev;
632 __entry->group = group;
633 ),
634
635 TP_printk("dev %d,%d group %u",
636 MAJOR(__entry->dev), MINOR(__entry->dev),
637 __entry->group)
638);
639
640TRACE_EVENT(ext3_direct_IO_enter,
641 TP_PROTO(struct inode *inode, loff_t offset, unsigned long len, int rw),
642
643 TP_ARGS(inode, offset, len, rw),
644
645 TP_STRUCT__entry(
646 __field( ino_t, ino )
647 __field( dev_t, dev )
648 __field( loff_t, pos )
649 __field( unsigned long, len )
650 __field( int, rw )
651 ),
652
653 TP_fast_assign(
654 __entry->ino = inode->i_ino;
655 __entry->dev = inode->i_sb->s_dev;
656 __entry->pos = offset;
657 __entry->len = len;
658 __entry->rw = rw;
659 ),
660
661 TP_printk("dev %d,%d ino %lu pos %llu len %lu rw %d",
662 MAJOR(__entry->dev), MINOR(__entry->dev),
663 (unsigned long) __entry->ino,
664 (unsigned long long) __entry->pos, __entry->len,
665 __entry->rw)
666);
667
668TRACE_EVENT(ext3_direct_IO_exit,
669 TP_PROTO(struct inode *inode, loff_t offset, unsigned long len,
670 int rw, int ret),
671
672 TP_ARGS(inode, offset, len, rw, ret),
673
674 TP_STRUCT__entry(
675 __field( ino_t, ino )
676 __field( dev_t, dev )
677 __field( loff_t, pos )
678 __field( unsigned long, len )
679 __field( int, rw )
680 __field( int, ret )
681 ),
682
683 TP_fast_assign(
684 __entry->ino = inode->i_ino;
685 __entry->dev = inode->i_sb->s_dev;
686 __entry->pos = offset;
687 __entry->len = len;
688 __entry->rw = rw;
689 __entry->ret = ret;
690 ),
691
692 TP_printk("dev %d,%d ino %lu pos %llu len %lu rw %d ret %d",
693 MAJOR(__entry->dev), MINOR(__entry->dev),
694 (unsigned long) __entry->ino,
695 (unsigned long long) __entry->pos, __entry->len,
696 __entry->rw, __entry->ret)
697);
698
699TRACE_EVENT(ext3_unlink_enter,
700 TP_PROTO(struct inode *parent, struct dentry *dentry),
701
702 TP_ARGS(parent, dentry),
703
704 TP_STRUCT__entry(
705 __field( ino_t, parent )
706 __field( ino_t, ino )
707 __field( loff_t, size )
708 __field( dev_t, dev )
709 ),
710
711 TP_fast_assign(
712 __entry->parent = parent->i_ino;
713 __entry->ino = d_inode(dentry)->i_ino;
714 __entry->size = d_inode(dentry)->i_size;
715 __entry->dev = d_inode(dentry)->i_sb->s_dev;
716 ),
717
718 TP_printk("dev %d,%d ino %lu size %lld parent %ld",
719 MAJOR(__entry->dev), MINOR(__entry->dev),
720 (unsigned long) __entry->ino,
721 (unsigned long long)__entry->size,
722 (unsigned long) __entry->parent)
723);
724
725TRACE_EVENT(ext3_unlink_exit,
726 TP_PROTO(struct dentry *dentry, int ret),
727
728 TP_ARGS(dentry, ret),
729
730 TP_STRUCT__entry(
731 __field( ino_t, ino )
732 __field( dev_t, dev )
733 __field( int, ret )
734 ),
735
736 TP_fast_assign(
737 __entry->ino = d_inode(dentry)->i_ino;
738 __entry->dev = d_inode(dentry)->i_sb->s_dev;
739 __entry->ret = ret;
740 ),
741
742 TP_printk("dev %d,%d ino %lu ret %d",
743 MAJOR(__entry->dev), MINOR(__entry->dev),
744 (unsigned long) __entry->ino,
745 __entry->ret)
746);
747
748DECLARE_EVENT_CLASS(ext3__truncate,
749 TP_PROTO(struct inode *inode),
750
751 TP_ARGS(inode),
752
753 TP_STRUCT__entry(
754 __field( ino_t, ino )
755 __field( dev_t, dev )
756 __field( blkcnt_t, blocks )
757 ),
758
759 TP_fast_assign(
760 __entry->ino = inode->i_ino;
761 __entry->dev = inode->i_sb->s_dev;
762 __entry->blocks = inode->i_blocks;
763 ),
764
765 TP_printk("dev %d,%d ino %lu blocks %lu",
766 MAJOR(__entry->dev), MINOR(__entry->dev),
767 (unsigned long) __entry->ino, (unsigned long) __entry->blocks)
768);
769
770DEFINE_EVENT(ext3__truncate, ext3_truncate_enter,
771
772 TP_PROTO(struct inode *inode),
773
774 TP_ARGS(inode)
775);
776
777DEFINE_EVENT(ext3__truncate, ext3_truncate_exit,
778
779 TP_PROTO(struct inode *inode),
780
781 TP_ARGS(inode)
782);
783
784TRACE_EVENT(ext3_get_blocks_enter,
785 TP_PROTO(struct inode *inode, unsigned long lblk,
786 unsigned long len, int create),
787
788 TP_ARGS(inode, lblk, len, create),
789
790 TP_STRUCT__entry(
791 __field( ino_t, ino )
792 __field( dev_t, dev )
793 __field( unsigned long, lblk )
794 __field( unsigned long, len )
795 __field( int, create )
796 ),
797
798 TP_fast_assign(
799 __entry->ino = inode->i_ino;
800 __entry->dev = inode->i_sb->s_dev;
801 __entry->lblk = lblk;
802 __entry->len = len;
803 __entry->create = create;
804 ),
805
806 TP_printk("dev %d,%d ino %lu lblk %lu len %lu create %u",
807 MAJOR(__entry->dev), MINOR(__entry->dev),
808 (unsigned long) __entry->ino,
809 __entry->lblk, __entry->len, __entry->create)
810);
811
812TRACE_EVENT(ext3_get_blocks_exit,
813 TP_PROTO(struct inode *inode, unsigned long lblk,
814 unsigned long pblk, unsigned long len, int ret),
815
816 TP_ARGS(inode, lblk, pblk, len, ret),
817
818 TP_STRUCT__entry(
819 __field( ino_t, ino )
820 __field( dev_t, dev )
821 __field( unsigned long, lblk )
822 __field( unsigned long, pblk )
823 __field( unsigned long, len )
824 __field( int, ret )
825 ),
826
827 TP_fast_assign(
828 __entry->ino = inode->i_ino;
829 __entry->dev = inode->i_sb->s_dev;
830 __entry->lblk = lblk;
831 __entry->pblk = pblk;
832 __entry->len = len;
833 __entry->ret = ret;
834 ),
835
836 TP_printk("dev %d,%d ino %lu lblk %lu pblk %lu len %lu ret %d",
837 MAJOR(__entry->dev), MINOR(__entry->dev),
838 (unsigned long) __entry->ino,
839 __entry->lblk, __entry->pblk,
840 __entry->len, __entry->ret)
841);
842
843TRACE_EVENT(ext3_load_inode,
844 TP_PROTO(struct inode *inode),
845
846 TP_ARGS(inode),
847
848 TP_STRUCT__entry(
849 __field( ino_t, ino )
850 __field( dev_t, dev )
851 ),
852
853 TP_fast_assign(
854 __entry->ino = inode->i_ino;
855 __entry->dev = inode->i_sb->s_dev;
856 ),
857
858 TP_printk("dev %d,%d ino %lu",
859 MAJOR(__entry->dev), MINOR(__entry->dev),
860 (unsigned long) __entry->ino)
861);
862
863#endif /* _TRACE_EXT3_H */
864
865/* This part must be outside protection */
866#include <trace/define_trace.h>
diff --git a/include/trace/events/jbd.h b/include/trace/events/jbd.h
deleted file mode 100644
index da6f2591c25e..000000000000
--- a/include/trace/events/jbd.h
+++ /dev/null
@@ -1,194 +0,0 @@
1#undef TRACE_SYSTEM
2#define TRACE_SYSTEM jbd
3
4#if !defined(_TRACE_JBD_H) || defined(TRACE_HEADER_MULTI_READ)
5#define _TRACE_JBD_H
6
7#include <linux/jbd.h>
8#include <linux/tracepoint.h>
9
10TRACE_EVENT(jbd_checkpoint,
11
12 TP_PROTO(journal_t *journal, int result),
13
14 TP_ARGS(journal, result),
15
16 TP_STRUCT__entry(
17 __field( dev_t, dev )
18 __field( int, result )
19 ),
20
21 TP_fast_assign(
22 __entry->dev = journal->j_fs_dev->bd_dev;
23 __entry->result = result;
24 ),
25
26 TP_printk("dev %d,%d result %d",
27 MAJOR(__entry->dev), MINOR(__entry->dev),
28 __entry->result)
29);
30
31DECLARE_EVENT_CLASS(jbd_commit,
32
33 TP_PROTO(journal_t *journal, transaction_t *commit_transaction),
34
35 TP_ARGS(journal, commit_transaction),
36
37 TP_STRUCT__entry(
38 __field( dev_t, dev )
39 __field( int, transaction )
40 ),
41
42 TP_fast_assign(
43 __entry->dev = journal->j_fs_dev->bd_dev;
44 __entry->transaction = commit_transaction->t_tid;
45 ),
46
47 TP_printk("dev %d,%d transaction %d",
48 MAJOR(__entry->dev), MINOR(__entry->dev),
49 __entry->transaction)
50);
51
52DEFINE_EVENT(jbd_commit, jbd_start_commit,
53
54 TP_PROTO(journal_t *journal, transaction_t *commit_transaction),
55
56 TP_ARGS(journal, commit_transaction)
57);
58
59DEFINE_EVENT(jbd_commit, jbd_commit_locking,
60
61 TP_PROTO(journal_t *journal, transaction_t *commit_transaction),
62
63 TP_ARGS(journal, commit_transaction)
64);
65
66DEFINE_EVENT(jbd_commit, jbd_commit_flushing,
67
68 TP_PROTO(journal_t *journal, transaction_t *commit_transaction),
69
70 TP_ARGS(journal, commit_transaction)
71);
72
73DEFINE_EVENT(jbd_commit, jbd_commit_logging,
74
75 TP_PROTO(journal_t *journal, transaction_t *commit_transaction),
76
77 TP_ARGS(journal, commit_transaction)
78);
79
80TRACE_EVENT(jbd_drop_transaction,
81
82 TP_PROTO(journal_t *journal, transaction_t *commit_transaction),
83
84 TP_ARGS(journal, commit_transaction),
85
86 TP_STRUCT__entry(
87 __field( dev_t, dev )
88 __field( int, transaction )
89 ),
90
91 TP_fast_assign(
92 __entry->dev = journal->j_fs_dev->bd_dev;
93 __entry->transaction = commit_transaction->t_tid;
94 ),
95
96 TP_printk("dev %d,%d transaction %d",
97 MAJOR(__entry->dev), MINOR(__entry->dev),
98 __entry->transaction)
99);
100
101TRACE_EVENT(jbd_end_commit,
102 TP_PROTO(journal_t *journal, transaction_t *commit_transaction),
103
104 TP_ARGS(journal, commit_transaction),
105
106 TP_STRUCT__entry(
107 __field( dev_t, dev )
108 __field( int, transaction )
109 __field( int, head )
110 ),
111
112 TP_fast_assign(
113 __entry->dev = journal->j_fs_dev->bd_dev;
114 __entry->transaction = commit_transaction->t_tid;
115 __entry->head = journal->j_tail_sequence;
116 ),
117
118 TP_printk("dev %d,%d transaction %d head %d",
119 MAJOR(__entry->dev), MINOR(__entry->dev),
120 __entry->transaction, __entry->head)
121);
122
123TRACE_EVENT(jbd_do_submit_data,
124 TP_PROTO(journal_t *journal, transaction_t *commit_transaction),
125
126 TP_ARGS(journal, commit_transaction),
127
128 TP_STRUCT__entry(
129 __field( dev_t, dev )
130 __field( int, transaction )
131 ),
132
133 TP_fast_assign(
134 __entry->dev = journal->j_fs_dev->bd_dev;
135 __entry->transaction = commit_transaction->t_tid;
136 ),
137
138 TP_printk("dev %d,%d transaction %d",
139 MAJOR(__entry->dev), MINOR(__entry->dev),
140 __entry->transaction)
141);
142
143TRACE_EVENT(jbd_cleanup_journal_tail,
144
145 TP_PROTO(journal_t *journal, tid_t first_tid,
146 unsigned long block_nr, unsigned long freed),
147
148 TP_ARGS(journal, first_tid, block_nr, freed),
149
150 TP_STRUCT__entry(
151 __field( dev_t, dev )
152 __field( tid_t, tail_sequence )
153 __field( tid_t, first_tid )
154 __field(unsigned long, block_nr )
155 __field(unsigned long, freed )
156 ),
157
158 TP_fast_assign(
159 __entry->dev = journal->j_fs_dev->bd_dev;
160 __entry->tail_sequence = journal->j_tail_sequence;
161 __entry->first_tid = first_tid;
162 __entry->block_nr = block_nr;
163 __entry->freed = freed;
164 ),
165
166 TP_printk("dev %d,%d from %u to %u offset %lu freed %lu",
167 MAJOR(__entry->dev), MINOR(__entry->dev),
168 __entry->tail_sequence, __entry->first_tid,
169 __entry->block_nr, __entry->freed)
170);
171
172TRACE_EVENT(journal_write_superblock,
173 TP_PROTO(journal_t *journal, int write_op),
174
175 TP_ARGS(journal, write_op),
176
177 TP_STRUCT__entry(
178 __field( dev_t, dev )
179 __field( int, write_op )
180 ),
181
182 TP_fast_assign(
183 __entry->dev = journal->j_fs_dev->bd_dev;
184 __entry->write_op = write_op;
185 ),
186
187 TP_printk("dev %d,%d write_op %x", MAJOR(__entry->dev),
188 MINOR(__entry->dev), __entry->write_op)
189);
190
191#endif /* _TRACE_JBD_H */
192
193/* This part must be outside protection */
194#include <trace/define_trace.h>
diff --git a/mm/Kconfig b/mm/Kconfig
index e79de2bd12cd..d4e6495a720f 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -299,15 +299,9 @@ config BOUNCE
299# On the 'tile' arch, USB OHCI needs the bounce pool since tilegx will often 299# On the 'tile' arch, USB OHCI needs the bounce pool since tilegx will often
300# have more than 4GB of memory, but we don't currently use the IOTLB to present 300# have more than 4GB of memory, but we don't currently use the IOTLB to present
301# a 32-bit address to OHCI. So we need to use a bounce pool instead. 301# a 32-bit address to OHCI. So we need to use a bounce pool instead.
302#
303# We also use the bounce pool to provide stable page writes for jbd. jbd
304# initiates buffer writeback without locking the page or setting PG_writeback,
305# and fixing that behavior (a second time; jbd2 doesn't have this problem) is
306# a major rework effort. Instead, use the bounce buffer to snapshot pages
307# (until jbd goes away). The only jbd user is ext3.
308config NEED_BOUNCE_POOL 302config NEED_BOUNCE_POOL
309 bool 303 bool
310 default y if (TILE && USB_OHCI_HCD) || (BLK_DEV_INTEGRITY && JBD) 304 default y if TILE && USB_OHCI_HCD
311 305
312config NR_QUICK 306config NR_QUICK
313 int 307 int