aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ntfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ntfs')
-rw-r--r--fs/ntfs/ChangeLog1350
-rw-r--r--fs/ntfs/Makefile19
-rw-r--r--fs/ntfs/aops.c2324
-rw-r--r--fs/ntfs/aops.h109
-rw-r--r--fs/ntfs/attrib.c1258
-rw-r--r--fs/ntfs/attrib.h100
-rw-r--r--fs/ntfs/bitmap.c192
-rw-r--r--fs/ntfs/bitmap.h118
-rw-r--r--fs/ntfs/collate.c124
-rw-r--r--fs/ntfs/collate.h50
-rw-r--r--fs/ntfs/compress.c957
-rw-r--r--fs/ntfs/debug.c180
-rw-r--r--fs/ntfs/debug.h67
-rw-r--r--fs/ntfs/dir.c1569
-rw-r--r--fs/ntfs/dir.h48
-rw-r--r--fs/ntfs/endian.h93
-rw-r--r--fs/ntfs/file.c155
-rw-r--r--fs/ntfs/index.c461
-rw-r--r--fs/ntfs/index.h148
-rw-r--r--fs/ntfs/inode.c2616
-rw-r--r--fs/ntfs/inode.h321
-rw-r--r--fs/ntfs/layout.h2413
-rw-r--r--fs/ntfs/lcnalloc.c1002
-rw-r--r--fs/ntfs/lcnalloc.h112
-rw-r--r--fs/ntfs/logfile.c705
-rw-r--r--fs/ntfs/logfile.h307
-rw-r--r--fs/ntfs/malloc.h62
-rw-r--r--fs/ntfs/mft.c2829
-rw-r--r--fs/ntfs/mft.h127
-rw-r--r--fs/ntfs/mst.c203
-rw-r--r--fs/ntfs/namei.c498
-rw-r--r--fs/ntfs/ntfs.h129
-rw-r--r--fs/ntfs/quota.c117
-rw-r--r--fs/ntfs/quota.h35
-rw-r--r--fs/ntfs/runlist.c1438
-rw-r--r--fs/ntfs/runlist.h89
-rw-r--r--fs/ntfs/super.c2771
-rw-r--r--fs/ntfs/sysctl.c85
-rw-r--r--fs/ntfs/sysctl.h42
-rw-r--r--fs/ntfs/time.h100
-rw-r--r--fs/ntfs/types.h66
-rw-r--r--fs/ntfs/unistr.c384
-rw-r--r--fs/ntfs/upcase.c90
-rw-r--r--fs/ntfs/volume.h171
44 files changed, 26034 insertions, 0 deletions
diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
new file mode 100644
index 000000000000..1d2ad15f1533
--- /dev/null
+++ b/fs/ntfs/ChangeLog
@@ -0,0 +1,1350 @@
1ToDo/Notes:
2 - Find and fix bugs.
3 - Checkpoint or disable the user space journal ($UsnJrnl).
4 - In between ntfs_prepare/commit_write, need exclusion between
5 simultaneous file extensions. Need perhaps an NInoResizeUnderway()
6 flag which we can set in ntfs_prepare_write() and clear again in
7 ntfs_commit_write(). Just have to be careful in readpage/writepage,
8 as well as in truncate, that we play nice... We might need to have
9 a data_size field in the ntfs_inode to store the real attribute
10 length. Also need to be careful with initialized_size extension in
11 ntfs_prepare_write. Basically, just be _very_ careful in this code...
12 OTOH, perhaps i_sem, which is held across generic_file_write is
13 sufficient for synchronisation here. We then just need to make sure
14 ntfs_readpage/writepage/truncate interoperate properly with us.
15 UPDATE: The above is all ok as it is due to i_sem held. The only
16 thing that needs to be checked is ntfs_writepage() which does not
17 hold i_sem. It cannot change i_size but it needs to cope with a
18 concurrent i_size change.
19 - Implement mft.c::sync_mft_mirror_umount(). We currently will just
20 leave the volume dirty on umount if the final iput(vol->mft_ino)
21 causes a write of any mirrored mft records due to the mft mirror
22 inode having been discarded already. Whether this can actually ever
23 happen is unclear however so it is worth waiting until someone hits
24 the problem.
25 - Enable the code for setting the NT4 compatibility flag when we start
26 making NTFS 1.2 specific modifications.
27
282.1.23-WIP
29
30 - Add printk rate limiting for ntfs_warning() and ntfs_error() when
31 compiled without debug. This avoids a possible denial of service
32 attack. Thanks to Carl-Daniel Hailfinger from SuSE for pointing this
33 out.
34
352.1.22 - Many bug and race fixes and error handling improvements.
36
37 - Improve error handling in fs/ntfs/inode.c::ntfs_truncate().
38 - Change fs/ntfs/inode.c::ntfs_truncate() to return an error code
39 instead of void and provide a helper ntfs_truncate_vfs() for the
40 vfs ->truncate method.
41 - Add a new ntfs inode flag NInoTruncateFailed() and modify
42 fs/ntfs/inode.c::ntfs_truncate() to set and clear it appropriately.
43 - Fix min_size and max_size definitions in ATTR_DEF structure in
44 fs/ntfs/layout.h to be signed.
45 - Add attribute definition handling helpers to fs/ntfs/attrib.[hc]:
46 ntfs_attr_size_bounds_check(), ntfs_attr_can_be_non_resident(), and
47 ntfs_attr_can_be_resident(), which in turn use the new private helper
48 ntfs_attr_find_in_attrdef().
49 - In fs/ntfs/aops.c::mark_ntfs_record_dirty(), take the
50 mapping->private_lock around the dirtying of the buffer heads
51 analogous to the way it is done in __set_page_dirty_buffers().
52 - Ensure the mft record size does not exceed the PAGE_CACHE_SIZE at
53 mount time as this cannot work with the current implementation.
54 - Check for location of attribute name and improve error handling in
55 general in fs/ntfs/inode.c::ntfs_read_locked_inode() and friends.
56 - In fs/ntfs/aops.c::ntfs_writepage(), if the page is fully outside
57 i_size, i.e. race with truncate, invalidate the buffers on the page
58 so that they become freeable and hence the page does not leak.
59 - Remove unused function fs/ntfs/runlist.c::ntfs_rl_merge(). (Adrian
60 Bunk)
61 - Fix stupid bug in fs/ntfs/attrib.c::ntfs_attr_find() that resulted in
62 a NULL pointer dereference in the error code path when a corrupt
63 attribute was found. (Thanks to Domen Puncer for the bug report.)
64 - Add MODULE_VERSION() to fs/ntfs/super.c.
65 - Make several functions and variables static. (Adrian Bunk)
66 - Modify fs/ntfs/aops.c::mark_ntfs_record_dirty() so it allocates
67 buffers for the page if they are not present and then marks the
68 buffers belonging to the ntfs record dirty. This causes the buffers
69 to become busy and hence they are safe from removal until the page
70 has been written out.
71 - Fix stupid bug in fs/ntfs/attrib.c::ntfs_external_attr_find() in the
72 error handling code path that resulted in a BUG() due to trying to
73 unmap an extent mft record when the mapping of it had failed and it
74 thus was not mapped. (Thanks to Ken MacFerrin for the bug report.)
75 - Drop the runlist lock after the vcn has been read in
76 fs/ntfs/lcnalloc.c::__ntfs_cluster_free().
77 - Rewrite handling of multi sector transfer errors. We now do not set
78 PageError() when such errors are detected in the async i/o handler
79 fs/ntfs/aops.c::ntfs_end_buffer_async_read(). All users of mst
80 protected attributes now check the magic of each ntfs record as they
81 use it and act appropriately. This has the effect of making errors
82 granular per ntfs record rather than per page which solves the case
83 where we cannot access any of the ntfs records in a page when a
84 single one of them had an mst error. (Thanks to Ken MacFerrin for
85 the bug report.)
86 - Fix error handling in fs/ntfs/quota.c::ntfs_mark_quotas_out_of_date()
87 where we failed to release i_sem on the $Quota/$Q attribute inode.
88 - Fix bug in handling of bad inodes in fs/ntfs/namei.c::ntfs_lookup().
89 - Add mapping of unmapped buffers to all remaining code paths, i.e.
90 fs/ntfs/aops.c::ntfs_write_mst_block(), mft.c::ntfs_sync_mft_mirror(),
91 and write_mft_record_nolock(). From now on we require that the
92 complete runlist for the mft mirror is always mapped into memory.
93 - Add creation of buffers to fs/ntfs/mft.c::ntfs_sync_mft_mirror().
94 - Improve error handling in fs/ntfs/aops.c::ntfs_{read,write}_block().
95 - Cleanup fs/ntfs/aops.c::ntfs_{read,write}page() since we know that a
96 resident attribute will be smaller than a page which makes the code
97 simpler. Also make the code more tolerant to concurrent ->truncate.
98
992.1.21 - Fix some races and bugs, rewrite mft write code, add mft allocator.
100
101 - Implement extent mft record deallocation
102 fs/ntfs/mft.c::ntfs_extent_mft_record_free().
103 - Split runlist related functions off from attrib.[hc] to runlist.[hc].
104 - Add vol->mft_data_pos and initialize it at mount time.
105 - Rename init_runlist() to ntfs_init_runlist(), ntfs_vcn_to_lcn() to
106 ntfs_rl_vcn_to_lcn(), decompress_mapping_pairs() to
107 ntfs_mapping_pairs_decompress(), ntfs_merge_runlists() to
108 ntfs_runlists_merge() and adapt all callers.
109 - Add fs/ntfs/runlist.[hc]::ntfs_get_nr_significant_bytes(),
110 ntfs_get_size_for_mapping_pairs(), ntfs_write_significant_bytes(),
111 and ntfs_mapping_pairs_build(), adapted from libntfs.
112 - Make fs/ntfs/lcnalloc.c::ntfs_cluster_free_from_rl_nolock() not
113 static and add a declaration for it to lcnalloc.h.
114 - Add fs/ntfs/lcnalloc.h::ntfs_cluster_free_from_rl() which is a static
115 inline wrapper for ntfs_cluster_free_from_rl_nolock() which takes the
116 cluster bitmap lock for the duration of the call.
117 - Add fs/ntfs/attrib.[hc]::ntfs_attr_record_resize().
118 - Implement the equivalent of memset() for an ntfs attribute in
119 fs/ntfs/attrib.[hc]::ntfs_attr_set() and switch
120 fs/ntfs/logfile.c::ntfs_empty_logfile() to using it.
121 - Remove unnecessary casts from LCN_* constants.
122 - Implement fs/ntfs/runlist.c::ntfs_rl_truncate_nolock().
123 - Add MFT_RECORD_OLD as a copy of MFT_RECORD in fs/ntfs/layout.h and
124 change MFT_RECORD to contain the NTFS 3.1+ specific fields.
125 - Add a helper function fs/ntfs/aops.c::mark_ntfs_record_dirty() which
126 marks all buffers belonging to an ntfs record dirty, followed by
127 marking the page the ntfs record is in dirty and also marking the vfs
128 inode containing the ntfs record dirty (I_DIRTY_PAGES).
129 - Switch fs/ntfs/index.h::ntfs_index_entry_mark_dirty() to using the
130 new helper fs/ntfs/aops.c::mark_ntfs_record_dirty() and remove the no
131 longer needed fs/ntfs/index.[hc]::__ntfs_index_entry_mark_dirty().
132 - Move ntfs_{un,}map_page() from ntfs.h to aops.h and fix resulting
133 include errors.
134 - Move the typedefs for runlist_element and runlist from types.h to
135 runlist.h and fix resulting include errors.
136 - Remove unused {__,}format_mft_record() from fs/ntfs/mft.c.
137 - Modify fs/ntfs/mft.c::__mark_mft_record_dirty() to use the helper
138 mark_ntfs_record_dirty() which also changes the behaviour in that we
139 now set the buffers belonging to the mft record dirty as well as the
140 page itself.
141 - Update fs/ntfs/mft.c::write_mft_record_nolock() and sync_mft_mirror()
142 to cope with the fact that there now are dirty buffers in mft pages.
143 - Update fs/ntfs/inode.c::ntfs_write_inode() to also use the helper
144 mark_ntfs_record_dirty() and thus to set the buffers belonging to the
145 mft record dirty as well as the page itself.
146 - Fix compiler warnings on x86-64 in fs/ntfs/dir.c. (Randy Dunlap,
147 slightly modified by me)
148 - Add fs/ntfs/mft.c::try_map_mft_record() which fails with -EALREADY if
149 the mft record is already locked and otherwise behaves the same way
150 as fs/ntfs/mft.c::map_mft_record().
151 - Modify fs/ntfs/mft.c::write_mft_record_nolock() so that it only
152 writes the mft record if the buffers belonging to it are dirty.
153 Otherwise we assume that it was written out by other means already.
154 - Attempting to write outside initialized size is _not_ a bug so remove
155 the bug check from fs/ntfs/aops.c::ntfs_write_mst_block(). It is in
156 fact required to write outside initialized size when preparing to
157 extend the initialized size.
158 - Map the page instead of using page_address() before writing to it in
159 fs/ntfs/aops.c::ntfs_mft_writepage().
160 - Provide exclusion between opening an inode / mapping an mft record
161 and accessing the mft record in fs/ntfs/mft.c::ntfs_mft_writepage()
162 by setting the page not uptodate throughout ntfs_mft_writepage().
163 - Clear the page uptodate flag in fs/ntfs/aops.c::ntfs_write_mst_block()
164 to ensure no one can see the page whilst the mst fixups are applied.
165 - Add the helper fs/ntfs/mft.c::ntfs_may_write_mft_record() which
166 checks if an mft record may be written out safely obtaining any
167 necessary locks in the process. This is used by
168 fs/ntfs/aops.c::ntfs_write_mst_block().
169 - Modify fs/ntfs/aops.c::ntfs_write_mst_block() to also work for
170 writing mft records and improve its error handling in the process.
171 Now if any of the records in the page fail to be written out, all
172 other records will be written out instead of aborting completely.
173 - Remove ntfs_mft_aops and update all users to use ntfs_mst_aops.
174 - Modify fs/ntfs/inode.c::ntfs_read_locked_inode() to set the
175 ntfs_mst_aops for all inodes which are NInoMstProtected() and
176 ntfs_aops for all other inodes.
177 - Rename fs/ntfs/mft.c::sync_mft_mirror{,_umount}() to
178 ntfs_sync_mft_mirror{,_umount}() and change their parameters so they
179 no longer require an ntfs inode to be present. Update all callers.
180 - Cleanup the error handling in fs/ntfs/mft.c::ntfs_sync_mft_mirror().
181 - Clear the page uptodate flag in fs/ntfs/mft.c::ntfs_sync_mft_mirror()
182 to ensure no one can see the page whilst the mst fixups are applied.
183 - Remove the no longer needed fs/ntfs/mft.c::ntfs_mft_writepage() and
184 fs/ntfs/mft.c::try_map_mft_record().
185 - Fix callers of fs/ntfs/aops.c::mark_ntfs_record_dirty() to call it
186 with the ntfs inode which contains the page rather than the ntfs
187 inode the mft record of which is in the page.
188 - Fix race condition in fs/ntfs/inode.c::ntfs_put_inode() by moving the
189 index inode bitmap inode release code from there to
190 fs/ntfs/inode.c::ntfs_clear_big_inode(). (Thanks to Christoph
191 Hellwig for spotting this.)
192 - Fix race condition in fs/ntfs/inode.c::ntfs_put_inode() by taking the
193 inode semaphore around the code that sets ni->itype.index.bmp_ino to
194 NULL and reorganize the code to optimize it a bit. (Thanks to
195 Christoph Hellwig for spotting this.)
196 - Modify fs/ntfs/aops.c::mark_ntfs_record_dirty() to no longer take the
197 ntfs inode as a parameter as this is confusing and misleading and the
198 needed ntfs inode is available via NTFS_I(page->mapping->host).
199 Adapt all callers to this change.
200 - Modify fs/ntfs/mft.c::write_mft_record_nolock() and
201 fs/ntfs/aops.c::ntfs_write_mst_block() to only check the dirty state
202 of the first buffer in a record and to take this as the ntfs record
203 dirty state. We cannot look at the dirty state for subsequent
204 buffers because we might be racing with
205 fs/ntfs/aops.c::mark_ntfs_record_dirty().
206 - Move the static inline ntfs_init_big_inode() from fs/ntfs/inode.c to
207 inode.h and make fs/ntfs/inode.c::__ntfs_init_inode() non-static and
208 add a declaration for it to inode.h. Fix some compilation issues
209 that resulted due to #includes and header file interdependencies.
210 - Simplify setup of i_mode in fs/ntfs/inode.c::ntfs_read_locked_inode().
211 - Add helpers fs/ntfs/layout.h::MK_MREF() and MK_LE_MREF().
212 - Modify fs/ntfs/mft.c::map_extent_mft_record() to only verify the mft
213 record sequence number if it is specified (i.e. not zero).
214 - Add fs/ntfs/mft.[hc]::ntfs_mft_record_alloc() and various helper
215 functions used by it.
216 - Update Documentation/filesystems/ntfs.txt with instructions on how to
217 use the Device-Mapper driver with NTFS ftdisk/LDM raid. This removes
218 the linear raid problem with the Software RAID / MD driver when one
219 or more of the devices has an odd number of sectors.
220
2212.1.20 - Fix two stupid bugs introduced in 2.1.18 release.
222
223 - Fix stupid bug in fs/ntfs/attrib.c::ntfs_attr_reinit_search_ctx()
224 where we did not clear ctx->al_entry but it was still set due to
225 changes in ntfs_attr_lookup() and ntfs_external_attr_find() in
226 particular.
227 - Fix another stupid bug in fs/ntfs/attrib.c::ntfs_external_attr_find()
228 where we forgot to unmap the extent mft record when we had finished
229 enumerating an attribute which caused a bug check to trigger when the
230 VFS calls ->clear_inode.
231
2322.1.19 - Many cleanups, improvements, and a minor bug fix.
233
234 - Update ->setattr (fs/ntfs/inode.c::ntfs_setattr()) to refuse to
235 change the uid, gid, and mode of an inode as we do not support NTFS
236 ACLs yet.
237 - Remove BKL use from ntfs_setattr() syncing up with the rest of the
238 kernel.
239 - Get rid of the ugly transparent union in fs/ntfs/dir.c::ntfs_readdir()
240 and ntfs_filldir() as per suggestion from Al Viro.
241 - Change '\0' and L'\0' to simply 0 as per advice from Linus Torvalds.
242 - Update ->truncate (fs/ntfs/inode.c::ntfs_truncate()) to check if the
243 inode size has changed and to only output an error if so.
244 - Rename fs/ntfs/attrib.h::attribute_value_length() to ntfs_attr_size().
245 - Add le{16,32,64} as well as sle{16,32,64} data types to
246 fs/ntfs/types.h.
247 - Change ntfschar to be le16 instead of u16 in fs/ntfs/types.h.
248 - Add le versions of VCN, LCN, and LSN called leVCN, leLCN, and leLSN,
249 respectively, to fs/ntfs/types.h.
250 - Update endianness conversion macros in fs/ntfs/endian.h to use the
251 new types as appropriate.
252 - Do proper type casting when using sle64_to_cpup() in fs/ntfs/dir.c
253 and index.c.
254 - Add leMFT_REF data type to fs/ntfs/layout.h.
255 - Update all NTFS header files with the new little endian data types.
256 Affected files are fs/ntfs/layout.h, logfile.h, and time.h.
257 - Do proper type casting when using ntfs_is_*_recordp() in
258 fs/ntfs/logfile.c, mft.c, and super.c.
259 - Fix all the sparse bitwise warnings. Had to change all the typedef
260 enums storing little endian values to simple enums plus a typedef for
261 the datatype to make sparse happy.
262 - Fix a bug found by the new sparse bitwise warnings where the default
263 upcase table was defined as a pointer to wchar_t rather than ntfschar
264 in fs/ntfs/ntfs.h and super.c.
265 - Change {const_,}cpu_to_le{16,32}(0) to just 0 as suggested by Al Viro.
266
2672.1.18 - Fix scheduling latencies at mount time as well as an endianness bug.
268
269 - Remove vol->nr_mft_records as it was pretty meaningless and optimize
270 the calculation of total/free inodes as used by statfs().
271 - Fix scheduling latencies in ntfs_fill_super() by dropping the BKL
272 because the code itself is using the ntfs_lock semaphore which
273 provides safe locking. (Ingo Molnar)
274 - Fix a potential bug in fs/ntfs/mft.c::map_extent_mft_record() that
275 could occur in the future for when we start closing/freeing extent
276 inodes if we don't set base_ni->ext.extent_ntfs_inos to NULL after
277 we free it.
278 - Rename {find,lookup}_attr() to ntfs_attr_{find,lookup}() as well as
279 find_external_attr() to ntfs_external_attr_find() to cleanup the
280 namespace a bit and to be more consistent with libntfs.
281 - Rename {{re,}init,get,put}_attr_search_ctx() to
282 ntfs_attr_{{re,}init,get,put}_search_ctx() as well as the type
283 attr_search_context to ntfs_attr_search_ctx.
284 - Force use of ntfs_attr_find() in ntfs_attr_lookup() when searching
285 for the attribute list attribute itself.
286 - Fix endianness bug in ntfs_external_attr_find().
287 - Change ntfs_{external_,}attr_find() to return 0 on success, -ENOENT
288 if the attribute is not found, and -EIO on real error. In the case
289 of -ENOENT, the search context is updated to describe the attribute
290 before which the attribute being searched for would need to be
291 inserted if such an action were to be desired and in the case of
292 ntfs_external_attr_find() the search context is also updated to
293 indicate the attribute list entry before which the attribute list
294 entry of the attribute being searched for would need to be inserted
295 if such an action were to be desired. Also make ntfs_attr_find()
296 static and remove its prototype from attrib.h as it is not used
297 anywhere other than attrib.c. Update ntfs_attr_lookup() and all
298 callers of ntfs_{external,}attr_{find,lookup}() for the new return
299 values.
300 - Minor cleanup of fs/ntfs/inode.c::ntfs_init_locked_inode().
301
3022.1.17 - Fix bugs in mount time error code paths and other updates.
303
304 - Implement bitmap modification code (fs/ntfs/bitmap.[hc]). This
305 includes functions to set/clear a single bit or a run of bits.
306 - Add fs/ntfs/attrib.[hc]::ntfs_find_vcn() which returns the locked
307 runlist element containing a particular vcn. It also takes care of
308 mapping any needed runlist fragments.
309 - Implement cluster (de-)allocation code (fs/ntfs/lcnalloc.[hc]).
310 - Load attribute definition table from $AttrDef at mount time.
311 - Fix bugs in mount time error code paths involving (de)allocation of
312 the default and volume upcase tables.
313 - Remove ntfs_nr_mounts as it is no longer used.
314
3152.1.16 - Implement access time updates, file sync, async io, and read/writev.
316
317 - Add support for readv/writev and aio_read/aio_write (fs/ntfs/file.c).
318 This is done by setting the appropriate file operations pointers to
319 the generic helper functions provided by mm/filemap.c.
320 - Implement fsync, fdatasync, and msync both for files (fs/ntfs/file.c)
321 and directories (fs/ntfs/dir.c).
322 - Add support for {a,m,c}time updates to inode.c::ntfs_write_inode().
323 Note, except for the root directory and any other system files opened
324 by the user, the system files will not have their access times
325 updated as they are only accessed at the inode level and hence the
326 file level functions which cause the times to be updated are never
327 invoked.
328
3292.1.15 - Invalidate quotas when (re)mounting read-write.
330
331 - Add new element itype.index.collation_rule to the ntfs inode
332 structure and set it appropriately in ntfs_read_locked_inode().
333 - Implement a new inode type "index" to allow efficient access to the
334 indices found in various system files and adapt inode handling
335 accordingly (fs/ntfs/inode.[hc]). An index inode is essentially an
336 attribute inode (NInoAttr() is true) with an attribute type of
337 AT_INDEX_ALLOCATION. As such, it is no longer allowed to call
338 ntfs_attr_iget() with an attribute type of AT_INDEX_ALLOCATION as
339 there would be no way to distinguish between normal attribute inodes
340 and index inodes. The function to obtain an index inode is
341 ntfs_index_iget() and it uses the helper function
342 ntfs_read_locked_index_inode(). Note, we do not overload
343 ntfs_attr_iget() as indices consist of multiple attributes so using
344 ntfs_attr_iget() to obtain an index inode would be confusing.
345 - Ensure that there is no overflow when doing page->index <<
346 PAGE_CACHE_SHIFT by casting page->index to s64 in fs/ntfs/aops.c.
347 - Use atomic kmap instead of kmap() in fs/ntfs/aops.c::ntfs_read_page()
348 and ntfs_read_block().
349 - Use case sensitive attribute lookups instead of case insensitive ones.
350 - Lock all page cache pages belonging to mst protected attributes while
351 accessing them to ensure we never see corrupt data while the page is
352 under writeout.
353 - Add framework for generic ntfs collation (fs/ntfs/collate.[hc]).
354 We have ntfs_is_collation_rule_supported() to check if the collation
355 rule you want to use is supported and ntfs_collation() which actually
356 collates two data items. We currently only support COLLATION_BINARY
357 and COLLATION_NTOFS_ULONG but support for other collation rules will
358 be added as the need arises.
359 - Add a new type, ntfs_index_context, to allow retrieval of an index
360 entry using the corresponding index key. To get an index context,
361 use ntfs_index_ctx_get() and to release it, use ntfs_index_ctx_put().
362 This also adds a new slab cache for the index contexts. To lookup a
363 key in an index inode, use ntfs_index_lookup(). After modifying an
364 index entry, call ntfs_index_entry_flush_dcache_page() followed by
365 ntfs_index_entry_mark_dirty() to ensure the changes are written out
366 to disk. For details see fs/ntfs/index.[hc]. Note, at present, if
367 an index entry is in the index allocation attribute rather than the
368 index root attribute it will not be written out (you will get a
369 warning message about discarded changes instead).
370 - Load the quota file ($Quota) and check if quota tracking is enabled
371 and if so, mark the quotas out of date. This causes windows to
372 rescan the volume on boot and update all quota entries.
373 - Add a set_page_dirty address space operation for ntfs_m[fs]t_aops.
374 It is simply set to __set_page_dirty_nobuffers() to make sure that
375 running set_page_dirty() on a page containing mft/ntfs records will
376 not affect the dirty state of the page buffers.
377 - Add fs/ntfs/index.c::__ntfs_index_entry_mark_dirty() which sets all
378 buffers that are inside the ntfs record in the page dirty after which
379 it sets the page dirty. This allows ->writepage to only write the
380 dirty index records rather than having to write all the records in
381 the page. Modify fs/ntfs/index.h::ntfs_index_entry_mark_dirty() to
382 use this rather than __set_page_dirty_nobuffers().
383 - Implement fs/ntfs/aops.c::ntfs_write_mst_block() which enables the
384 writing of page cache pages belonging to mst protected attributes
385 like the index allocation attribute in directory indices and other
386 indices like $Quota/$Q, etc. This means that the quota is now marked
387 out of date on all volumes rather than only on ones where the quota
388 defaults entry is in the index root attribute of the $Quota/$Q index.
389
3902.1.14 - Fix an NFSd caused deadlock reported by several users.
391
392 - Modify fs/ntfs/dir.c::ntfs_readdir() to copy the index root attribute value
393 to a buffer so that we can put the search context and unmap the mft
394 record before calling the filldir() callback. We need to do this
395 because of NFSd which calls ->lookup() from its filldir() callback
396 and this causes NTFS to deadlock as ntfs_lookup() maps the mft record
397 of the directory and since ntfs_readdir() has got it mapped already
398 ntfs_lookup() deadlocks.
399
4002.1.13 - Enable overwriting of resident files and housekeeping of system files.
401
402 - Implement writing of mft records (fs/ntfs/mft.[hc]), which includes
403 keeping the mft mirror in sync with the mft when mirrored mft records
404 are written. The functions are write_mft_record{,_nolock}(). The
405 implementation is quite rudimentary for now with lots of things not
406 implemented yet but I am not sure any of them can actually occur so
407 I will wait for people to hit each one and only then implement it.
408 - Commit open system inodes at umount time. This should make it
409 virtually impossible for sync_mft_mirror_umount() to ever be needed.
410 - Implement ->write_inode (fs/ntfs/inode.c::ntfs_write_inode()) for the
411 ntfs super operations. This gives us inode writing via the VFS inode
412 dirty code paths. Note: Access time updates are not implemented yet.
413 - Implement fs/ntfs/mft.[hc]::{,__}mark_mft_record_dirty() and make
414 fs/ntfs/aops.c::ntfs_writepage() and ntfs_commit_write() use it, thus
415 finally enabling resident file overwrite! (-8 This also includes a
416 placeholder for ->writepage (ntfs_mft_writepage()), which for now
417 just redirties the page and returns. Also, at umount time, we for
418 now throw away all mft data page cache pages after the last call to
419 ntfs_commit_inode() in the hope that all inodes will have been
420 written out by then and hence no dirty (meta)data will be lost. We
421 also check for this case and emit an error message telling the user
422 to run chkdsk.
423 - Use set_page_writeback() and end_page_writeback() in the resident
424 attribute code path of fs/ntfs/aops.c::ntfs_writepage() otherwise
425 the radix-tree tag PAGECACHE_TAG_DIRTY remains set even though the
426 page is clean.
427 - Implement ntfs_mft_writepage() so it now checks if any of the mft
428 records in the page are dirty and if so redirties the page and
429 returns. Otherwise it just returns (after doing set_page_writeback(),
430 unlock_page(), end_page_writeback() or the radix-tree tag
431 PAGECACHE_TAG_DIRTY remains set even though the page is clean), thus
432 allowing the VM to do with the page as it pleases. Also, at umount
433 time, now only throw away dirty mft (meta)data pages if dirty inodes
434 are present and ask the user to email us if they see this happening.
435 - Add functions ntfs_{clear,set}_volume_flags(), to modify the volume
436 information flags (fs/ntfs/super.c).
437 - Mark the volume dirty when (re)mounting read-write and mark it clean
438 when unmounting or remounting read-only. If any volume errors are
439 found, the volume is left marked dirty to force chkdsk to run.
440 - Add code to set the NT4 compatibility flag when (re)mounting
441 read-write for newer NTFS versions but leave it commented out for now
442 since we do not make any modifications that are NTFS 1.2 specific yet
443 and since setting this flag breaks Captive-NTFS which is not nice.
444 This code must be enabled once we start writing NTFS 1.2 specific
445 changes otherwise Windows NTFS driver might crash / cause corruption.
446
4472.1.12 - Fix the second fix to the decompression engine and some cleanups.
448
449 - Add a new address space operations struct, ntfs_mst_aops, for mst
450 protected attributes. This is because the default ntfs_aops do not
451 make sense with mst protected data and were they to write anything to
452 such an attribute they would cause data corruption so we provide
453 ntfs_mst_aops which does not have any write related operations set.
454 - Cleanup dirty ntfs inode handling (fs/ntfs/inode.[hc]) which also
455 includes an adapted ntfs_commit_inode() and an implementation of
456 ntfs_write_inode() which for now just cleans dirty inodes without
457 writing them (it does emit a warning that this is happening).
458 - Undo the second decompression engine fix (see 2.1.9 release ChangeLog
459 entry) as it was only fixing a theoretical bug but at the same time
460 it badly broke the handling of sparse and uncompressed compression
461 blocks.
462
4632.1.11 - Driver internal cleanups.
464
465 - Only build logfile.o if building the driver with read-write support.
466 - Really final white space cleanups.
467 - Use generic_ffs() instead of ffs() in logfile.c which allows the
468 log_page_size variable to be optimized by gcc into a constant.
469 - Rename uchar_t to ntfschar everywhere as uchar_t is unsigned 1-byte
470 char as defined by POSIX and as found on some systems.
471
4722.1.10 - Force read-only (re)mounting of volumes with unsupported volume flags.
473
474 - Finish off the white space cleanups (remove trailing spaces, etc).
475 - Clean up ntfs_fill_super() and ntfs_read_inode_mount() by removing
476 the kludges around the first iget(). Instead of (re)setting ->s_op
477 we have the $MFT inode set up by explicit new_inode() / set ->i_ino /
478 insert_inode_hash() / call ntfs_read_inode_mount() directly. This
479 kills the need for second super_operations and allows to return error
480 from ntfs_read_inode_mount() without resorting to ugly "poisoning"
481 tricks. (Al Viro)
482 - Force read-only (re)mounting if any of the following bits are set in
483 the volume information flags:
484 VOLUME_IS_DIRTY, VOLUME_RESIZE_LOG_FILE,
485 VOLUME_UPGRADE_ON_MOUNT, VOLUME_DELETE_USN_UNDERWAY,
486 VOLUME_REPAIR_OBJECT_ID, VOLUME_MODIFIED_BY_CHKDSK
487 To make this easier we define VOLUME_MUST_MOUNT_RO_MASK with all the
488 above bits set so the test is made easy.
489
4902.1.9 - Fix two bugs in decompression engine.
491
492 - Fix a bug where we would not always detect that we have reached the
493 end of a compression block because we were ending at minus one byte
494 which is effectively the same as being at the end. The fix is to
495 check whether the uncompressed buffer has been fully filled and if so
496 we assume we have reached the end of the compression block. A big
497 thank you to Marcin Gibuła for the bug report, the assistance in
498 tracking down the bug and testing the fix.
499 - Fix a possible bug where when a compressed read is truncated to the
500 end of the file, the offset inside the last page was not truncated.
501
5022.1.8 - Handle $MFT mirror and $LogFile, improve time handling, and cleanups.
503
504 - Use get_bh() instead of manual atomic_inc() in fs/ntfs/compress.c.
505 - Modify fs/ntfs/time.c::ntfs2utc(), get_current_ntfs_time(), and
506 utc2ntfs() to work with struct timespec instead of time_t on the
507 Linux UTC time side thus preserving the full precision of the NTFS
508 time and only losing up to 99 nano-seconds in the Linux UTC time.
509 - Move fs/ntfs/time.c to fs/ntfs/time.h and make the time functions
510 static inline.
511 - Remove unused ntfs_dirty_inode().
512 - Cleanup super operations declaration in fs/ntfs/super.c.
513 - Wrap flush_dcache_mft_record_page() in #ifdef NTFS_RW.
514 - Add NInoTestSetFoo() and NInoTestClearFoo() macro magic to
515 fs/ntfs/inode.h and use it to declare NInoTest{Set,Clear}Dirty.
516 - Move typedefs for ntfs_attr and test_t from fs/ntfs/inode.c to
517 fs/ntfs/inode.h so they can be used elsewhere.
518 - Determine the mft mirror size as the number of mirrored mft records
519 and store it in ntfs_volume->mftmirr_size (fs/ntfs/super.c).
520 - Load the mft mirror at mount time and compare the mft records stored
521 in it to the ones in the mft. Force a read-only mount if the two do
522 not match (fs/ntfs/super.c).
523 - Fix type casting related warnings on 64-bit architectures. Thanks
524 to Meelis Roos for reporting them.
525 - Move %L to %ll as %L is floating point and %ll is integer which is
526 what we want.
527 - Read the journal ($LogFile) and determine if the volume has been
528 shutdown cleanly and force a read-only mount if not (fs/ntfs/super.c
529 and fs/ntfs/logfile.c). This is a little bit of a crude check in
530 that we only look at the restart areas and not at the actual log
531 records so that there will be a very small number of cases where we
532 think that a volume is dirty when in fact it is clean. This should
533 only affect volumes that have not been shutdown cleanly and did not
534 have any pending, non-check-pointed i/o.
535 - If the $LogFile indicates a clean shutdown and a read-write (re)mount
536 is requested, empty $LogFile by overwriting it with 0xff bytes to
537 ensure that Windows cannot cause data corruption by replaying a stale
538 journal after Linux has written to the volume.
539
5402.1.7 - Enable NFS exporting of mounted NTFS volumes.
541
542 - Set i_generation in the VFS inode from the seq_no of the NTFS inode.
543 - Make ntfs_lookup() NFS export safe, i.e. use d_splice_alias(), etc.
544 - Implement ->get_dentry() in fs/ntfs/namei.c::ntfs_get_dentry() as the
545 default doesn't allow inode number 0 which is a valid inode on NTFS
546 and even if it did allow that it uses iget() instead of ntfs_iget()
547 which makes it useless for us.
548 - Implement ->get_parent() in fs/ntfs/namei.c::ntfs_get_parent() as the
549 default just returns -EACCES which is not very useful.
550 - Define export operations (->s_export_op) for NTFS (ntfs_export_ops)
551 and set them up in the super block at mount time (super.c). This
552 allows mounted NTFS volumes to be exported via NFS.
553 - Add missing return -EOPNOTSUPP; in
554 fs/ntfs/aops.c::ntfs_commit_nonresident_write().
555 - Enforce no atime and no dir atime updates at mount/remount time as
556 they are not implemented yet anyway.
557 - Move a few assignments in fs/ntfs/attrib.c::load_attribute_list() to
558 after a NULL check. Thanks to Dave Jones for pointing this out.
559
5602.1.6 - Fix minor bug in handling of compressed directories.
561
562 - Fix bug in handling of compressed directories. A compressed
563 directory is not really compressed so when we set the ->i_blocks
564 field of a compressed directory inode we were setting it from the
565 non-existing field ni->itype.compressed.size which gave random
566 results... For directories we now always use ni->allocated_size.
567
5682.1.5 - Fix minor bug in attribute list attribute handling.
569
570 - Fix bug in attribute list handling. Actually it is not as much a bug
571 as too much protection in that we were not allowing attribute lists
572 which waste space on disk while Windows XP clearly allows it and in
573 fact creates such attribute lists so our driver was failing.
574 - Update NTFS documentation ready for 2.6 kernel release.
575
5762.1.4 - Reduce compiler requirements.
577
578 - Remove all uses of unnamed structs and unions in the driver to make
579 old and newer gcc versions happy. Makes it a bit uglier IMO but at
580 least people will stop hassling me about it.
581
5822.1.3 - Important bug fixes in corner cases.
583
584 - super.c::parse_ntfs_boot_sector(): Correct the check for 64-bit
585 clusters. (Philipp Thomas)
586 - attrib.c::load_attribute_list(): Fix bug when initialized_size is a
587 multiple of the block_size but not the cluster size. (Szabolcs
588 Szakacsits <szaka@sienet.hu>)
589
5902.1.2 - Important bug fixes alleviating the hangs in statfs.
591
592 - Fix buggy free cluster and free inode determination logic.
593
5942.1.1 - Minor updates.
595
596 - Add handling for initialized_size != data_size in compressed files.
597 - Reduce function local stack usage from 0x3d4 bytes to just noise in
598 fs/ntfs/upcase.c. (Randy Dunlap <rddunlap@osdl.org>)
599 - Remove compiler warnings for newer gcc.
600 - Pages are no longer kmapped by mm/filemap.c::generic_file_write()
601 around calls to ->{prepare,commit}_write. Adapt NTFS appropriately
602 in fs/ntfs/aops.c::ntfs_prepare_nonresident_write() by using
603 kmap_atomic(KM_USER0).
604
6052.1.0 - First steps towards write support: implement file overwrite.
606
607 - Add configuration option for developmental write support with an
608 appropriately scary configuration help text.
609 - Initial implementation of fs/ntfs/aops.c::ntfs_writepage() and its
610 helper fs/ntfs/aops.c::ntfs_write_block(). This enables mmap(2) based
611 overwriting of existing files on ntfs. Note: Resident files are
612 only written into memory, and not written out to disk at present, so
613 avoid writing to files smaller than about 1kiB.
614 - Initial implementation of fs/ntfs/aops.c::ntfs_prepare_write(), its
615 helper fs/ntfs/aops.c::ntfs_prepare_nonresident_write() and their
616 counterparts, fs/ntfs/aops.c::ntfs_commit_write(), and
617 fs/ntfs/aops.c::ntfs_commit_nonresident_write(), respectively. Also,
618 add generic_file_write() to the ntfs file operations (fs/ntfs/file.c).
619 This enables write(2) based overwriting of existing files on ntfs.
620 Note: As with mmap(2) based overwriting, resident files are only
621 written into memory, and not written out to disk at present, so avoid
622 writing to files smaller than about 1kiB.
623 - Implement ->truncate (fs/ntfs/inode.c::ntfs_truncate()) and
624 ->setattr() (fs/ntfs/inode.c::ntfs_setattr()) inode operations for
625 files with the purpose of intercepting and aborting all i_size
626 changes which we do not support yet. ntfs_truncate() actually only
627 emits a warning message but AFAICS our interception of i_size changes
628 elsewhere means ntfs_truncate() never gets called for i_size changes.
629 It is only called from generic_file_write() when we fail in
630 ntfs_prepare_{,nonresident_}write() in order to discard any
631 instantiated buffers beyond i_size. Thus i_size is not actually
632 changed so our warning message is enough. Unfortunately it is not
633 possible to easily determine if i_size is being changed or not hence
634 we just emit an appropriately worded error message.
635
6362.0.25 - Small bug fixes and cleanups.
637
638 - Unlock the page in an out of memory error code path in
639 fs/ntfs/aops.c::ntfs_read_block().
640 - If fs/ntfs/aops.c::ntfs_read_page() is called on an uptodate page,
641 just unlock the page and return. (This can happen due to ->writepage
642 clearing PageUptodate() during write out of MstProtected()
643 attributes.)
644 - Remove leaked write code again.
645
6462.0.24 - Cleanups.
647
648 - Treat BUG_ON() as ASSERT() not VERIFY(), i.e. do not use side effects
649 inside BUG_ON(). (Adam J. Richter)
650 - Split logical OR expressions inside BUG_ON() into individual BUG_ON()
651 calls for improved debugging. (Adam J. Richter)
652 - Add errors flag to the ntfs volume state, accessed via
653 NVol{,Set,Clear}Errors(vol).
654 - Do not allow read-write remounts of read-only volumes with errors.
655 - Clarify comment for ntfs file operation sendfile which was added by
656 Christoph Hellwig a while ago (just using generic_file_sendfile())
657 to say that ntfs ->sendfile is only used for the case where the
658 source data is on the ntfs partition and the destination is
659 somewhere else, i.e. nothing we need to concern ourselves with.
660 - Add generic_file_write() as our ntfs file write operation.
661
6622.0.23 - Major bug fixes (races, deadlocks, non-i386 architectures).
663
664 - Massive internal locking changes to mft record locking. Fixes lock
665 recursion and replaces the mrec_lock read/write semaphore with a
666 mutex. Also removes the now superfluous mft_count. This fixes several
667 race conditions and deadlocks, especially in the future write code.
668 - Fix ntfs over loopback for compressed files by adding an
669 optimization barrier. (gcc was screwing up otherwise ?)
670 - Miscellaneous cleanups all over the code and a fix or two in error
671 handling code paths.
672 Thanks go to Christoph Hellwig for pointing out the following two:
673 - Remove now unused function fs/ntfs/malloc.h::vmalloc_nofs().
674 - Fix ntfs_free() for ia64 and parisc by checking for VMALLOC_END, too.
675
6762.0.22 - Cleanups, mainly to ntfs_readdir(), and use C99 initializers.
677
678 - Change fs/ntfs/dir.c::ntfs_readdir() to only read/write ->f_pos once
679 at entry/exit respectively.
680 - Use C99 initializers for structures.
681 - Remove unused variable blocks from fs/ntfs/aops.c::ntfs_read_block().
682
6832.0.21 - Check for, and refuse to work with too large files/directories/volumes.
684
685 - Limit volume size at mount time to 2TiB on architectures where
686 unsigned long is 32-bits (fs/ntfs/super.c::parse_ntfs_boot_sector()).
687 This is the most we can do without overflowing the 32-bit limit of
688 the block device size imposed on us by sb_bread() and sb_getblk()
689 for the time being.
690 - Limit file/directory size at open() time to 16TiB on architectures
691 where unsigned long is 32-bits (fs/ntfs/file.c::ntfs_file_open() and
692 fs/ntfs/dir.c::ntfs_dir_open()). This is the most we can do without
693 overflowing the page cache page index.
694
6952.0.20 - Support non-resident directory index bitmaps, fix page leak in readdir.
696
697 - Move the directory index bitmap to use an attribute inode instead of
698 having special fields for it inside the ntfs inode structure. This
699 means that the index bitmaps now use the page cache for i/o, too,
700 and also as a side effect we get support for non-resident index
701 bitmaps for free.
702 - Simplify/cleanup error handling in fs/ntfs/dir.c::ntfs_readdir() and
703 fix a page leak that manifested itself in some cases.
704 - Add fs/ntfs/inode.c::ntfs_put_inode(), which we need to release the
705 index bitmap inode on the final iput().
706
7072.0.19 - Fix race condition, improvements, and optimizations in i/o interface.
708
709 - Apply block optimization added to fs/ntfs/aops.c::ntfs_read_block()
710 to fs/ntfs/compress.c::ntfs_file_read_compressed_block() as well.
711 - Drop the "file" from ntfs_file_read_compressed_block().
712 - Rename fs/ntfs/aops.c::ntfs_end_buffer_read_async() to
713 ntfs_end_buffer_async_read() (more like the fs/buffer.c counterpart).
714 - Update ntfs_end_buffer_async_read() with the improved logic from
715 its updated counterpart fs/buffer.c::end_buffer_async_read(). Apply
716 further logic improvements to better determine when we set PageError.
717 - Update submission of buffers in fs/ntfs/aops.c::ntfs_read_block() to
718 check for the buffers being uptodate first in line with the updated
719 fs/buffer.c::block_read_full_page(). This plugs a small race
720 condition.
721
7222.0.18 - Fix race condition in reading of compressed files.
723
724 - There was a narrow window between checking a buffer head for being
725 uptodate and locking it in ntfs_file_read_compressed_block(). We now
726 lock the buffer and then check whether it is uptodate or not.
727
7282.0.17 - Cleanups and optimizations - shrinking the ToDo list.
729
730 - Modify fs/ntfs/inode.c::ntfs_read_locked_inode() to return an error
731 code and update callers, i.e. ntfs_iget(), to pass that error code
732 up instead of just using -EIO.
733 - Modifications to super.c to ensure that both mount and remount
734 cannot set any write related options when the driver is compiled
735 read-only.
736 - Optimize block resolution in fs/ntfs/aops.c::ntfs_read_block() to
737 cache the current runlist element. This should improve performance
738 when reading very large and/or very fragmented data.
739
7402.0.16 - Convert access to $MFT/$BITMAP to attribute inode API.
741
742 - Fix a stupid bug introduced in 2.0.15 where we were unmapping the
743 wrong inode in fs/ntfs/inode.c::ntfs_attr_iget().
744 - Fix debugging check in fs/ntfs/aops.c::ntfs_read_block().
745 - Convert $MFT/$BITMAP access to attribute inode API and remove all
746 remnants of the ugly mftbmp address space and operations hack. This
747 means we finally have only one readpage function as well as only one
748 async io completion handler. Yey! The mft bitmap is now just an
749 attribute inode and is accessed from vol->mftbmp_ino just as if it
750 were a normal file. Fake inodes rule. (-:
751
7522.0.15 - Fake inodes based attribute i/o via the pagecache, fixes and cleanups.
753
754 - Fix silly bug in fs/ntfs/super.c::parse_options() which was causing
755 remounts to fail when the partition had an entry in /etc/fstab and
756 the entry specified the nls= option.
757 - Apply same macro magic used in fs/ntfs/inode.h to fs/ntfs/volume.h to
758 expand all the helper functions NVolFoo(), NVolSetFoo(), and
759 NVolClearFoo().
760 - Move copyright statement from driver initialisation message to
761 module description (fs/super.c). This makes the initialisation
762 message fit on one line and fits in better with rest of kernel.
763 - Update fs/ntfs/attrib.c::map_run_list() to work on both real and
764 attribute inodes, and both for files and directories.
765 - Implement fake attribute inodes allowing all attribute i/o to go via
766 the page cache and to use all the normal vfs/mm functionality:
767 - Add ntfs_attr_iget() and its helper ntfs_read_locked_attr_inode()
768 to fs/ntfs/inode.c.
769 - Add needed cleanup code to ntfs_clear_big_inode().
770 - Merge address space operations for files and directories (aops.c),
771 now just have ntfs_aops:
772 - Rename:
773 end_buffer_read_attr_async() -> ntfs_end_buffer_read_async(),
774 ntfs_attr_read_block() -> ntfs_read_block(),
775 ntfs_file_read_page() -> ntfs_readpage().
776 - Rewrite fs/ntfs/aops.c::ntfs_readpage() to work on both real and
777 attribute inodes, and both for files and directories.
778 - Remove obsolete fs/ntfs/aops.c::ntfs_mst_readpage().
779
7802.0.14 - Run list merging code cleanup, minor locking changes, typo fixes.
781
782 - Change fs/ntfs/super.c::ntfs_statfs() to not rely on BKL by moving
783 the locking out of super.c::get_nr_free_mft_records() and taking and
784 dropping the mftbmp_lock rw_semaphore in ntfs_statfs() itself.
785 - Bring attribute runlist merging code (fs/ntfs/attrib.c) in sync with
786 current userspace ntfs library code. This means that if a merge
787 fails the original runlists are always left unmodified instead of
788 being silently corrupted.
789 - Misc typo fixes.
790
7912.0.13 - Use iget5_locked() in preparation for fake inodes and small cleanups.
792
793 - Remove nr_mft_bits and the now superfluous union with nr_mft_records
794 from ntfs_volume structure.
795 - Remove nr_lcn_bits and the now superfluous union with nr_clusters
796 from ntfs_volume structure.
797 - Use iget5_locked() and friends instead of conventional iget(). Wrap
798 the call in fs/ntfs/inode.c::ntfs_iget() and update callers of iget()
799 to use ntfs_iget(). Leave only one iget() call at mount time so we
800 don't need an ntfs_iget_mount().
801 - Change fs/ntfs/inode.c::ntfs_new_extent_inode() to take mft_no as an
802 additional argument.
803
8042.0.12 - Initial cleanup of address space operations following 2.0.11 changes.
805
806 - Merge fs/ntfs/aops.c::end_buffer_read_mst_async() and
807 fs/ntfs/aops.c::end_buffer_read_file_async() into one function
808 fs/ntfs/aops.c::end_buffer_read_attr_async() using NInoMstProtected()
809 to determine whether to apply mst fixups or not.
810 - Above change allows merging fs/ntfs/aops.c::ntfs_file_read_block()
811 and fs/ntfs/aops.c::ntfs_mst_readpage() into one function
812 fs/ntfs/aops.c::ntfs_attr_read_block(). Also, create a tiny wrapper
813 fs/ntfs/aops.c::ntfs_mst_readpage() to transform the parameters from
814 the VFS readpage function prototype to the ntfs_attr_read_block()
815 function prototype.
816
8172.0.11 - Initial preparations for fake inode based attribute i/o.
818
819 - Move definition of ntfs_inode_state_bits to fs/ntfs/inode.h and
820 do some macro magic (adapted from include/linux/buffer_head.h) to
821 expand all the helper functions NInoFoo(), NInoSetFoo(), and
822 NInoClearFoo().
823 - Add new flag to ntfs_inode_state_bits: NI_Sparse.
824 - Add new fields to ntfs_inode structure to allow use of fake inodes
825 for attribute i/o: type, name, name_len. Also add new state bits:
826 NI_Attr, which, if set, indicates the inode is a fake inode, and
827 NI_MstProtected, which, if set, indicates the attribute uses multi
828 sector transfer protection, i.e. fixups need to be applied after
829 reads and before/after writes.
830 - Rename fs/ntfs/inode.c::ntfs_{new,clear,destroy}_inode() to
831 ntfs_{new,clear,destroy}_extent_inode() and update callers.
832 - Use ntfs_clear_extent_inode() in fs/ntfs/inode.c::__ntfs_clear_inode()
833 instead of ntfs_destroy_extent_inode().
834 - Cleanup memory deallocations in {__,}ntfs_clear_{,big_}inode().
835 - Make all operations on ntfs inode state bits use the NIno* functions.
836 - Set up the new ntfs inode fields and state bits in
837 fs/ntfs/inode.c::ntfs_read_inode() and add appropriate cleanup of
838 allocated memory to __ntfs_clear_inode().
839 - Cleanup ntfs_inode structure a bit for better ordering of elements
840 w.r.t. their size to allow better packing of the structure in memory.
841
8422.0.10 - There can only be 2^32 - 1 inodes on an NTFS volume.
843
844 - Add check at mount time to verify that the number of inodes on the
845 volume does not exceed 2^32 - 1, which is the maximum allowed for
846 NTFS according to Microsoft.
847 - Change mft_no member of ntfs_inode structure to be unsigned long.
848 Update all users. This makes ntfs_inode->mft_no just a copy of struct
849 inode->i_ino. But we can't just always use struct inode->i_ino and
850 remove mft_no because extent inodes do not have an attached struct
851 inode.
852
8532.0.9 - Decompression engine now uses a single buffer and other cleanups.
854
855 - Change decompression engine to use a single buffer protected by a
856 spin lock instead of per-CPU buffers. (Rusty Russell)
857 - Do not update cb_pos when handling a partial final page during
858 decompression of a sparse compression block, as the value is later
859 reset without being read/used. (Rusty Russell)
860 - Switch to using the new KM_BIO_SRC_IRQ for atomic kmap()s. (Andrew
861 Morton)
862 - Change buffer size in ntfs_readdir()/ntfs_filldir() to use
863 NLS_MAX_CHARSET_SIZE which makes the buffers almost 1kiB each but
864 it also makes everything safer so it is a good thing.
865 - Miscellaneous minor cleanups to comments.
866
8672.0.8 - Major updates for handling of case sensitivity and dcache aliasing.
868
869 Big thanks go to Al Viro and other inhabitants of #kernel for investing
870 their time to discuss the case sensitivity and dcache aliasing issues.
871
872 - Remove unused source file fs/ntfs/attraops.c.
873 - Remove show_inodes mount option(s), thus dropping support for
874 displaying of short file names.
875 - Remove deprecated mount option posix.
876 - Restore show_sys_files mount option.
877 - Add new mount option case_sensitive, to determine if the driver
878 treats file names as case sensitive or not. If case sensitive, create
879 file names in the POSIX namespace. Otherwise create file names in the
880 LONG/WIN32 namespace. Note, files remain accessible via their short
881 file name, if it exists.
882 - Remove really dumb logic bug in boot sector recovery code.
883 - Fix dcache aliasing issues wrt short/long file names via changes
884 to fs/ntfs/dir.c::ntfs_lookup_inode_by_name() and
885 fs/ntfs/namei.c::ntfs_lookup():
886 - Add additional argument to ntfs_lookup_inode_by_name() in which we
887 return information about the matching file name if the case is not
888 matching or the match is a short file name. See comments above the
889 function definition for details.
890 - Change ntfs_lookup() to only create dcache entries for the correctly
891 cased file name and only for the WIN32 namespace counterpart of DOS
892 namespace file names. This ensures we have only one dentry per
893 directory and also removes all dcache aliasing issues between short
894 and long file names once we add write support. See comments above
895 function for details.
896 - Fix potential 1 byte overflow in fs/ntfs/unistr.c::ntfs_ucstonls().
897
8982.0.7 - Minor cleanups and updates for changes in core kernel code.
899
900 - Remove much of the NULL struct element initializers.
901 - Various updates to make compatible with recent kernels.
902 - Remove defines of MAX_BUF_PER_PAGE and include linux/buffer_head.h
903 in fs/ntfs/ntfs.h instead.
904 - Remove no longer needed KERNEL_VERSION checks. We are now in the
905 kernel proper so they are no longer needed.
906
9072.0.6 - Major bugfix to make compatible with other kernel changes.
908
909 - Initialize the mftbmp address space properly now that there are more
910 fields in the struct address_space. This was leading to hangs and
911 oopses on umount since 2.5.12 because of changes to other parts of
912 the kernel. We probably want a kernel generic init_address_space()
913 function...
914 - Drop BKL from ntfs_readdir() after consultation with Al Viro. The
915 only caller of ->readdir() is vfs_readdir() which holds i_sem during
916 the call, and i_sem is sufficient protection against changes in the
917 directory inode (including ->i_size).
918 - Use generic_file_llseek() for directories (as opposed to
919 default_llseek()) as this downs i_sem instead of the BKL which is
920 what we now need for exclusion against ->f_pos changes considering we
921 no longer take the BKL in ntfs_readdir().
922
9232.0.5 - Major bugfix. Buffer overflow in extent inode handling.
924
925 - No need to set old blocksize in super.c::ntfs_fill_super() as the
926 VFS does so via invocation of deactivate_super() calling
927 fs->fill_super() calling block_kill_super() which does it.
928 - BKL moved from VFS into dir.c::ntfs_readdir(). (Linus Torvalds)
929 -> Do we really need it? I don't think so as we have exclusion on
930 the directory ntfs_inode rw_semaphore mrec_lock. We might have to
931 move the ->f_pos accesses under the mrec_lock though. Check this...
932 - Fix really, really, really stupid buffer overflow in extent inode
933 handling in mft.c::map_extent_mft_record().
934
9352.0.4 - Cleanups and updates for kernel 2.5.11.
936
937 - Add documentation on how to use the MD driver to be able to use NTFS
938 stripe and volume sets in Linux and generally cleanup documentation
939 a bit.
940 Remove all uses of kdev_t in favour of struct block_device *:
941 - Change compress.c::ntfs_file_read_compressed_block() to use
942 sb_getblk() instead of getblk().
943 - Change super.c::ntfs_fill_super() to use bdev_hardsect_size() instead
944 of get_hardsect_size().
945 - No need to get old blocksize in super.c::ntfs_fill_super() as
946 fs/super.c::get_sb_bdev() already does this.
947 - Set bh->b_bdev instead of bh->b_dev throughout aops.c.
948
9492.0.3 - Small bug fixes, cleanups, and performance improvements.
950
951 - Remove some dead code from mft.c.
952 - Optimize readpage and read_block functions throughout aops.c so that
953 only initialized blocks are read. Non-initialized ones have their
954 buffer head mapped, zeroed, and set up to date, without scheduling
955 any i/o. Thanks to Al Viro for advice on how to avoid the device i/o.
956 Thanks go to Andrew Morton for spotting the below:
957 - Fix buglet in allocate_compression_buffers() error code path.
958 - Call flush_dcache_page() after modifying page cache page contents in
959 ntfs_file_readpage().
960 - Check for existence of page buffers throughout aops.c before calling
961 create_empty_buffers(). This happens when an I/O error occurs and the
962 read is retried. (It also happens once writing is implemented so that
963 needed doing anyway but I had left it for later...)
964 - Don't BUG_ON() uptodate and/or mapped buffers throughout aops.c in
965 readpage and read_block functions. Reasoning same as above (i.e. I/O
966 error retries and future write code paths.)
967
9682.0.2 - Minor updates and cleanups.
969
970 - Cleanup: rename mst.c::__post_read_mst_fixup to post_write_mst_fixup
971 and cleanup the code a bit, removing the unused size parameter.
972 - Change default fmask to 0177 and update documentation.
973 - Change attrib.c::get_attr_search_ctx() to return the search context
974 directly instead of taking the address of a pointer. A return value
975 of NULL means the allocation failed. Updated all callers
976 appropriately.
977 - Update to 2.5.9 kernel (preserving backwards compatibility) by
978 replacing all occurrences of page->buffers with page_buffers(page).
979 - Fix minor bugs in runlist merging, also minor cleanup.
980 - Updates to bootsector layout and mft mirror contents descriptions.
981 - Small bug fix in error detection in unistr.c and some cleanups.
982 - Grow name buffer allocations in unistr.c in aligned multiples of 64
983 bytes.
984
9852.0.1 - Minor updates.
986
987 - Make default umask correspond to documentation.
988 - Improve documentation.
989 - Set default mode to include execute bit. The {u,f,d}mask can be used
990 to take it away if desired. This allows binaries to be executed from
991 a mounted ntfs partition.
992
9932.0.0 - New version number. Remove TNG from the name. Now in the kernel.
994
995 - Add kill_super, just keeping up with the vfs changes in the kernel.
996 - Repeat some changes from tng-0.0.8 that somehow got lost on the way
997 from the CVS import into BitKeeper.
998 - Begin to implement proper handling of allocated_size vs
999 initialized_size vs data_size (i.e. i_size). Done are
1000 mft.c::ntfs_mft_readpage(), aops.c::end_buffer_read_index_async(),
1001 and attrib.c::load_attribute_list().
1002 - Lock the runlist in attrib.c::load_attribute_list() while using it.
1003 - Fix memory leak in ntfs_file_read_compressed_block() and generally
1004 clean up compress.c a little, removing some uncommented/unused debug
1005 code.
1006 - Tidy up dir.c a little bit.
1007 - Don't bother getting the runlist in inode.c::ntfs_read_inode().
1008 - Merge mft.c::ntfs_mft_readpage() and aops.c::ntfs_index_readpage()
1009 creating aops.c::ntfs_mst_readpage(), improving the handling of
1010 holes and overflow in the process and implementing the correct
1011 equivalent of ntfs_file_get_block() in ntfs_mst_readpage() itself.
1012 I am aiming for correctness at the moment. Modularisation can come
1013 later.
1014 - Rename aops.c::end_buffer_read_index_async() to
1015 end_buffer_read_mst_async() and optimize the overflow checking and
1016 handling.
1017 - Use the host of the mftbmp address space mapping to hold the ntfs
1018 volume. This is needed so the async i/o completion handler can
1019 retrieve a pointer to the volume. Hopefully this will not cause
1020 problems elsewhere in the kernel... Otherwise will need to use a
1021 fake inode.
1022 - Complete implementation of proper handling of allocated_size vs
1023 initialized_size vs data_size (i.e. i_size) in whole driver.
1024 Basically aops.c is now completely rewritten.
1025 - Change NTFS driver name to just NTFS and set version number to 2.0.0
1026 to make a clear distinction from the old driver which is still on
1027 version 1.1.22.
1028
1029tng-0.0.8 - 08/03/2002 - Now using BitKeeper, http://linux-ntfs.bkbits.net/
1030
1031 - Replace bdevname(sb->s_dev) with sb->s_id.
1032 - Remove now superfluous new-line characters in all callers of
1033 ntfs_debug().
1034 - Apply kludge in ntfs_read_inode(), setting i_nlink to 1 for
1035 directories. Without this the "find" utility gets very upset which is
1036 fair enough as Linux/Unix do not support directory hard links.
1037 - Further runlist merging work. (Richard Russon)
1038 - Backwards compatibility for gcc-2.95. (Richard Russon)
1039 - Update to kernel 2.5.5-pre1 and rediff the now tiny patch.
1040 - Convert to new file system declaration using ->ntfs_get_sb() and
1041 replacing ntfs_read_super() with ntfs_fill_super().
1042 - Set s_maxbytes to MAX_LFS_FILESIZE to avoid page cache page index
1043 overflow on 32-bit architectures.
1044 - Cleanup upcase loading code to use ntfs_(un)map_page().
1045 - Disable/reenable preemption in critical sections of compression engine.
1046 - Replace device size determination in ntfs_fill_super() with
1047 sb->s_bdev->bd_inode->i_size (in bytes) and remove now superfluous
1048 function super.c::get_nr_blocks().
1049 - Implement a mount time option (show_inodes) allowing choice of which
1050 types of inode names readdir() returns and modify ntfs_filldir()
1051 accordingly. There are several parameters to show_inodes:
1052 system: system files
1053 win32: long file names (including POSIX file names) [DEFAULT]
1054 long: same as win32
1055 dos: short file names only (excluding POSIX file names)
1056 short: same as dos
1057 posix: same as both win32 and dos
1058 all: all file names
1059 Note that the options are additive, i.e. specifying:
1060 -o show_inodes=system,show_inodes=win32,show_inodes=dos
1061 is the same as specifying:
1062 -o show_inodes=all
1063 Note that the "posix" and "all" options will show all directory
1064 names, BUT the link count on each directory inode entry is set to 1,
1065 due to Linux not supporting directory hard links. This may well
1066 confuse some userspace applications, since the directory names will
1067 have the same inode numbers. Thus it is NOT advisable to use the
1068 "posix" or "all" options. We provide them only for completeness sake.
1069 - Add copies of allocated_size, initialized_size, and compressed_size to
1070 the ntfs inode structure and set them up in
1071 inode.c::ntfs_read_inode(). These reflect the unnamed data attribute
1072 for files and the index allocation attribute for directories.
1073 - Add copies of allocated_size and initialized_size to ntfs inode for
1074 $BITMAP attribute of large directories and set them up in
1075 inode.c::ntfs_read_inode().
1076 - Add copies of allocated_size and initialized_size to ntfs volume for
1077 $BITMAP attribute of $MFT and set them up in
1078 super.c::load_system_files().
1079 - Parse deprecated ntfs driver options (iocharset, show_sys_files,
1080 posix, and utf8) and tell user what the new options to use are. Note
1081 we still do support them but they will be removed with kernel 2.7.x.
1082 - Change all occurrences of integer long long printf formatting to hex
1083 as printk() will not support long long integer format if/when the
1084 div64 patch goes into the kernel.
1085 - Make slab caches have stable names and change the names to what they
1086 were intended to be. These changes are required/made possible by the
1087 new slab cache name handling which removes the length limitation by
1088 requiring the caller of kmem_cache_create() to supply a stable name
1089 which is then referenced but not copied.
1090 - Rename run_list structure to run_list_element and create a new
1091 run_list structure containing a pointer to a run_list_element
1092 structure and a read/write semaphore. Adapt all users of runlists
1093 to new scheme and take and release the lock as needed. This fixes a
1094 nasty race as the run_list changes even when inodes are locked for
1095 reading and even when the inode isn't locked at all, so we really
1096 needed the serialization. We use a semaphore rather than a spinlock
1097 as memory allocations can sleep and doing everything GFP_ATOMIC
1098 would be silly.
1099 - Cleanup read_inode() removing all code checking for lowest_vcn != 0.
1100 This can never happen due to the nature of lookup_attr() and how we
1101 support attribute lists. If it did happen it would imply the inode
1102 being corrupt.
1103 - Check for lowest_vcn != 0 in ntfs_read_inode() and mark the inode as
1104 bad if found.
1105 - Update to 2.5.6-pre2 changes in struct address_space.
1106 - Use parent_ino() when accessing d_parent inode number in dir.c.
1107 - Import Sourceforge CVS repository into BitKeeper repository:
1108 http://linux-ntfs.bkbits.net/ntfs-tng-2.5
1109 - Update fs/Makefile, fs/Config.help, fs/Config.in, and
1110 Documentation/filesystems/ntfs.txt for NTFS TNG.
1111 - Create kernel configuration option controlling whether debugging
1112 is enabled or not.
1113 - Add the required export of end_buffer_io_sync() from the patches
1114 directory to the kernel code.
1115 - Update inode.c::ntfs_show_options() with show_inodes mount option.
1116 - Update errors mount option.
1117
1118tng-0.0.7 - 13/02/2002 - The driver is now feature complete for read-only!
1119
1120 - Cleanup mft.c and its debug/error output in particular. Fix a minor
1121 bug in mapping of extent inodes. Update all the comments to fit all
1122 the recent code changes.
1123 - Modify vcn_to_lcn() to cope with entirely unmapped runlists.
1124 - Cleanups in compress.c, mostly comments and folding help.
1125 - Implement attrib.c::map_run_list() as a generic helper.
1126 - Make compress.c::ntfs_file_read_compressed_block() use map_run_list()
1127 thus making code shorter and enabling attribute list support.
1128 - Cleanup incorrect use of [su]64 with %L printf format specifier in
1129 all source files. Type casts to [unsigned] long long added to correct
1130 the mismatches (important for architectures which have long long not
1131 being 64 bits).
1132 - Merge async io completion handlers for directory indexes and $MFT
1133 data into one by setting the index_block_size{_bits} of the ntfs
1134 inode for $MFT to the mft_record_size{_bits} of the ntfs_volume.
1135 - Cleanup aops.c, update comments.
1136 - Make ntfs_file_get_block() use map_run_list() so all files now
1137 support attribute lists.
1138 - Make ntfs_dir_readpage() almost verbatim copy of
1139 block_read_full_page() by using ntfs_file_get_block() with only real
1140 difference being the use of our own async io completion handler
1141 rather than the default one, thus reducing the amount of code and
1142 automatically enabling attribute list support for directory indices.
1143 - Fix bug in load_attribute_list() - forgot to call brelse in error
1144 code path.
1145 - Change parameters to find_attr() and lookup_attr(). We no longer
1146 pass in the upcase table and its length. These can be gotten from
1147 ctx->ntfs_ino->vol->upcase{_len}. Update all callers.
1148 - Cleanups in attrib.c.
1149 - Implement merging of runlists, attrib.c::merge_run_lists() and its
1150 helpers. (Richard Russon)
1151 - Attribute lists part 2, attribute extents and multi part runlists:
1152 enable proper support for LCN_RL_NOT_MAPPED and automatic mapping of
1153 further runlist parts via attrib.c::map_run_list().
1154 - Tiny endianness bug fix in decompress_mapping_pairs().
1155
1156tng-0.0.6 - Encrypted directories, bug fixes, cleanups, debugging enhancements.
1157
1158 - Enable encrypted directories. (Their index root is marked encrypted
1159 to indicate that new files in that directory should be created
1160 encrypted.)
1161 - Fix bug in NInoBmpNonResident() macro. (Cut and paste error.)
1162 - Enable $Extend system directory. Most (if not all) extended system
1163 files do not have unnamed data attributes so ntfs_read_inode() had to
1164 special case them but that is ok, as the special casing recovery
1165 happens inside an error code path so there is zero slow down in the
1166 normal fast path. The special casing is done by introducing a new
1167 function inode.c::ntfs_is_extended_system_file() which checks if any
1168 of the hard links in the inode point to $Extend as being their parent
1169 directory and if they do we assume this is an extended system file.
1170 - Create a sysctl/proc interface to allow {dis,en}abling of debug output
1171 when compiled with -DDEBUG. Default is debug messages to be disabled.
1172 To enable them, one writes a non-zero value to /proc/sys/fs/ntfs-debug
1173 (if /proc is enabled) or uses sysctl(2) to effect the same (if sysctl
1174 interface is enabled). Inspired by old ntfs driver.
1175 - Add debug_msgs insmod/kernel boot parameter to set whether debug
1176 messages are {dis,en}abled. This is useful to enable debug messages
1177 during ntfs initialization and is the only way to activate debugging
1178 when the sysctl interface is not enabled.
1179 - Cleanup debug output in various places.
1180 - Remove all dollar signs ($) from the source (except comments) to
1181 enable compilation on architectures whose gcc compiler does not
1182 support dollar signs in the names of variables/constants. Attribute
1183 types now start with AT_ instead of $ and $I30 is now just I30.
1184 - Cleanup ntfs_lookup() and add consistency check of sequence numbers.
1185 - Load complete runlist for $MFT/$BITMAP during mount and cleanup
1186 access functions. This means we now cope with $MFT/$BITMAP being
1187 spread across several mft records.
1188 - Disable modification of mft_zone_multiplier on remount. We can always
1189 reenable this later on if we really want to, but we will need to make
1190 sure we readjust the mft_zone size / layout accordingly.
1191
1192tng-0.0.5 - Modernize for 2.5.x and further in line-ing with Al Viro's comments.
1193
1194 - Use sb_set_blocksize() instead of set_blocksize() and verify the
1195 return value.
1196 - Use sb_bread() instead of bread() throughout.
1197 - Add index_vcn_size{_bits} to ntfs_inode structure to store the size
1198 of a directory index block vcn. Apply resulting simplifications in
1199 dir.c everywhere.
1200 - Fix a small bug somewhere (but forgot what it was).
1201 - Change ntfs_{debug,error,warning} to enable gcc to do type checking
1202 on the printf-format parameter list and fix bugs reported by gcc
1203 as a result. (Richard Russon)
1204 - Move inode allocation strategy to Al's new stuff but maintain the
1205 divorce of ntfs_inode from struct inode. To achieve this we have two
1206 separate slab caches, one for big ntfs inodes containing a struct
1207 inode and pure ntfs inodes and at the same time fix some faulty
1208 error code paths in ntfs_read_inode().
1209 - Show mount options in proc (inode.c::ntfs_show_options()).
1210
1211tng-0.0.4 - Big changes, getting in line with Al Viro's comments.
1212
1213 - Modified (un)map_mft_record functions to be common for read and write
1214 case. To specify which is which, added extra parameter at front of
1215 parameter list. Pass either READ or WRITE to this, each has the
1216 obvious meaning.
1217 - General cleanups to allow for easier folding in vi.
1218 - attrib.c::decompress_mapping_pairs() now accepts the old runlist
1219 argument, and invokes attrib.c::merge_run_lists() to merge the old
1220 and the new runlists.
1221 - Removed attrib.c::find_first_attr().
1222 - Implemented loading of attribute list and complete runlist for $MFT.
1223 This means we now cope with $MFT being spread across several mft
1224 records.
1225 - Adapt to 2.5.2-pre9 and the changed create_empty_buffers() syntax.
1226 - Adapt major/minor/kdev_t/[bk]devname stuff to new 2.5.x kernels.
1227 - Make ntfs_volume be allocated via kmalloc() instead of using a slab
1228 cache. There are too few ntfs_volume structures at any one time
1229 to justify a private slab cache.
1230 - Fix bogus kmap() use in async io completion. Now use kmap_atomic().
1231 Use KM_BIO_IRQ on advice from IRC/kernel...
1232 - Use ntfs_map_page() in map_mft_record() and create ->readpage method
1233 for reading $MFT (ntfs_mft_readpage). In the process create dedicated
1234 address space operations (ntfs_mft_aops) for $MFT inode mapping. Also
1235 removed the now superfluous exports from the kernel core patch.
1236 - Fix a bug where kfree() was used instead of ntfs_free().
1237 - Change map_mft_record() to take ntfs_inode as argument instead of
1238 vfs inode. Ditto for unmap_mft_record(). Adapt all callers.
1239 - Add pointer to ntfs_volume to ntfs_inode.
1240 - Add mft record number and sequence number to ntfs_inode. Stop using
1241 i_ino and i_generation for in-driver purposes.
1242 - Implement attrib.c::merge_run_lists(). (Richard Russon)
1243 - Remove use of proper inodes by extent inodes. Move i_ino and
1244 i_generation to ntfs_inode to do this. Apply simplifications that
1245 result and remove iget_no_wait(), etc.
1246 - Pass ntfs_inode everywhere in the driver (used to be struct inode).
1247 - Add reference counting in ntfs_inode for the ntfs inode itself and
1248 for the mapped mft record.
1249 - Extend mft record mapping so we can (un)map extent mft records (new
1250 functions (un)map_extent_mft_record), and so mappings are reference
1251 counted and don't have to happen twice if already mapped - just ref
1252 count increases.
1253 - Add -o iocharset as alias to -o nls for backwards compatibility.
1254 - The latest core patch is now tiny. In fact just a single additional
1255 export is necessary over the base kernel.
1256
1257tng-0.0.3 - Cleanups, enhancements, bug fixes.
1258
1259 - Work on attrib.c::decompress_mapping_pairs() to detect base extents
1260 and setup the runlist appropriately using knowledge provided by the
1261 sizes in the base attribute record.
1262 - Balance the get_/put_attr_search_ctx() calls so we don't leak memory
1263 any more.
1264 - Introduce ntfs_malloc_nofs() and ntfs_free() to allocate/free a single
1265 page or use vmalloc depending on the amount of memory requested.
1266 - Cleanup error output. The __FUNCTION__ "(): " is now added
1267 automatically. Introduced a new header file debug.h to support this
1268 and also moved ntfs_debug() function into it.
1269 - Make reading of compressed files more intelligent and especially get
1270 rid of the vmalloc_nofs() from readpage(). This now uses per CPU
1271 buffers (allocated at first mount with cluster size <= 4kiB and
1272 deallocated on last umount with cluster size <= 4kiB), and
1273 asynchronous io for the compressed data using a list of buffer heads.
1274 Er, we use synchronous io as async io only works on whole pages
1275 covered by buffers and not on individual buffer heads...
1276 - Bug fix for reading compressed files with sparse compression blocks.
1277
1278tng-0.0.2 - Now handles larger/fragmented/compressed volumes/files/dirs.
1279
1280 - Fixed handling of directories when cluster size exceeds index block
1281 size.
1282 - Hide DOS only name space directory entries from readdir() but allow
1283 them in lookup(). This should fix the problem that Linux doesn't
1284 support directory hard links, while still allowing access to entries
1285 via their short file name. This also has the benefit of mimicking
1286 what Windows users are used to, so it is the ideal solution.
1287 - Implemented sync_page everywhere so no more hangs in D state when
1288 waiting for a page.
1289 - Stop using bforget() in favour of brelse().
1290 - Stop locking buffers unnecessarily.
1291 - Implemented compressed files (inode->mapping contains uncompressed
1292 data, raw compressed data is currently bread() into a vmalloc()ed
1293 memory buffer).
1294 - Enable compressed directories. (Their index root is marked compressed
1295 to indicate that new files in that directory should be created
1296 compressed.)
1297 - Use vsnprintf rather than vsprintf in the ntfs_error and ntfs_warning
1298 functions. (Thanks to Will Dyson for pointing this out.)
1299 - Moved the ntfs_inode and ntfs_volume (the former ntfs_inode_info and
1300 ntfs_sb_info) out of the common inode and super_block structures and
1301 started using the generic_ip and generic_sbp pointers instead. This
1302 makes ntfs entirely private with respect to the kernel tree.
1303 - Detect compiler version and abort with error message if gcc less than
1304 2.96 is used.
1305 - Fix bug in name comparison function in unistr.c.
1306 - Implement attribute lists part 1, the infrastructure (search contexts
1307 and operations, find_external_attr(), lookup_attr()) and make the
1308 code use the infrastructure.
1309 - Fix stupid buffer overflow bug that became apparent on larger run
1310 list containing attributes.
1311 - Fix bugs in readdir() that became apparent on larger directories.
1312
1313 The driver is now really useful and survives the test
1314 find . -type f -exec md5sum "{}" \;
1315 without any error messages on an over 1GiB sized partition with >16k
1316 files on it, including compressed files and directories and many files
1317 and directories with attribute lists.
1318
1319tng-0.0.1 - The first useful version.
1320
1321 - Added ntfs_lookup().
1322 - Added default upcase generation and handling.
1323 - Added compile options to be shown on module init.
1324 - Many bug fixes that were "hidden" before.
1325 - Update to latest kernel.
1326 - Added ntfs_readdir().
1327 - Added file operations for mmap(), read(), open() and llseek(). We just
1328 use the generic ones. The whole point of going through implementing
1329 readpage() methods and where possible get_block() call backs is that
1330 this allows us to make use of the generic high level methods provided
1331 by the kernel.
1332
1333 The driver is now actually useful! Yey. (-: It undoubtedly has got bugs
1334 though and it doesn't implement accessing compressed files yet. Also,
1335 accessing files with attribute list attributes is not implemented yet
1336 either. But for small or simple file systems it should work and allow
1337 you to list directories, use stat on directory entries and the file
1338 system, open, read, mmap and llseek around in files. A big mile stone
1339 has been reached!
1340
1341tng-0.0.0 - Initial version tag.
1342
1343 Initial driver implementation. The driver can mount and umount simple
1344 NTFS file systems (i.e. ones without attribute lists in the system
1345 files). If the mount fails there might be problems in the error handling
1346 code paths, so be warned. Otherwise it seems to be loading the system
1347 files nicely and the mft record read mapping/unmapping seems to be
1348 working nicely, too. Proof of inode metadata in the page cache and non-
1349 resident file unnamed stream data in the page cache concepts is thus
1350 complete.
diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile
new file mode 100644
index 000000000000..7b66381a0b0f
--- /dev/null
+++ b/fs/ntfs/Makefile
@@ -0,0 +1,19 @@
1# Rules for making the NTFS driver.
2
# Build ntfs.o according to the value of CONFIG_NTFS_FS.
3obj-$(CONFIG_NTFS_FS) += ntfs.o
4
# Objects that are always linked into ntfs.o.
5ntfs-objs := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \
6 index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \
7 unistr.o upcase.o
8
# Pass the driver version string to the C sources as NTFS_VERSION.
9EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.22\"
10
# CONFIG_NTFS_DEBUG=y defines DEBUG for the C sources.
11ifeq ($(CONFIG_NTFS_DEBUG),y)
12EXTRA_CFLAGS += -DDEBUG
13endif
14
# CONFIG_NTFS_RW=y defines NTFS_RW and adds the objects listed below.
15ifeq ($(CONFIG_NTFS_RW),y)
16EXTRA_CFLAGS += -DNTFS_RW
17
18ntfs-objs += bitmap.o lcnalloc.o logfile.o quota.o
19endif
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
new file mode 100644
index 000000000000..45d56e41ed98
--- /dev/null
+++ b/fs/ntfs/aops.c
@@ -0,0 +1,2324 @@
1/**
2 * aops.c - NTFS kernel address space operations and page cache handling.
3 * Part of the Linux-NTFS project.
4 *
5 * Copyright (c) 2001-2004 Anton Altaparmakov
6 * Copyright (c) 2002 Richard Russon
7 *
8 * This program/include file is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as published
10 * by the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program/include file is distributed in the hope that it will be
14 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
15 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program (in the main directory of the Linux-NTFS
20 * distribution in the file COPYING); if not, write to the Free Software
21 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
23
24#include <linux/errno.h>
25#include <linux/mm.h>
26#include <linux/pagemap.h>
27#include <linux/swap.h>
28#include <linux/buffer_head.h>
29#include <linux/writeback.h>
30
31#include "aops.h"
32#include "attrib.h"
33#include "debug.h"
34#include "inode.h"
35#include "mft.h"
36#include "runlist.h"
37#include "types.h"
38#include "ntfs.h"
39
40/**
41 * ntfs_end_buffer_async_read - async io completion for reading attributes
42 * @bh: buffer head on which io is completed
43 * @uptodate: whether @bh is now uptodate or not
44 *
45 * Asynchronous I/O completion handler for reading pages belonging to the
46 * attribute address space of an inode. The inodes can either be files or
47 * directories or they can be fake inodes describing some attribute.
48 *
 * On success the buffer is marked uptodate and any part of it that lies
 * beyond the initialized_size of the ntfs inode is zeroed. On failure the
 * buffer is marked not uptodate, an error is logged and the error bit is set
 * on the page.
 *
 * The page itself is only finished (fixed up, marked uptodate, unlocked) by
 * the invocation that finds no other buffer of the page with async read i/o
 * still pending.
 *
49 * If NInoMstProtected(), perform the post read mst fixups when all IO on the
50 * page has been completed and mark the page uptodate or set the error bit on
51 * the page. To determine the size of the records that need fixing up, we
52 * cheat a little bit by setting the index_block_size in ntfs_inode to the ntfs
53 * record size, and index_block_size_bits, to the log(base 2) of the ntfs
54 * record size.
55 */
56static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
57{
 /* Function-static, hence one lock shared by every call of this handler. */
58 static DEFINE_SPINLOCK(page_uptodate_lock);
59 unsigned long flags;
60 struct buffer_head *tmp;
61 struct page *page;
62 ntfs_inode *ni;
63 int page_uptodate = 1;
64
65 page = bh->b_page;
66 ni = NTFS_I(page->mapping->host);
67
68 if (likely(uptodate)) {
69 s64 file_ofs;
70
71 set_buffer_uptodate(bh);
72
 /* Byte offset of the start of this buffer within the attribute. */
73 file_ofs = ((s64)page->index << PAGE_CACHE_SHIFT) +
74 bh_offset(bh);
75 /* Does this buffer extend beyond initialized_size? If so zero that part. */
76 if (file_ofs + bh->b_size > ni->initialized_size) {
77 char *addr;
78 int ofs = 0;
79
 /*
  * ofs is the number of leading bytes of the buffer that are
  * still inside initialized_size and must be preserved; it
  * stays 0 when the whole buffer lies beyond initialized_size.
  */
80 if (file_ofs < ni->initialized_size)
81 ofs = ni->initialized_size - file_ofs;
82 addr = kmap_atomic(page, KM_BIO_SRC_IRQ);
83 memset(addr + bh_offset(bh) + ofs, 0, bh->b_size - ofs);
84 flush_dcache_page(page);
85 kunmap_atomic(addr, KM_BIO_SRC_IRQ);
86 }
87 } else {
88 clear_buffer_uptodate(bh);
89 ntfs_error(ni->vol->sb, "Buffer I/O error, logical block %llu.",
90 (unsigned long long)bh->b_blocknr);
91 SetPageError(page);
92 }
 /*
  * Under the lock: retire this buffer, then walk the circular list of
  * buffers of the page (via b_this_page) to find out whether any other
  * buffer still has async read i/o pending and whether all buffers are
  * uptodate.
  */
93 spin_lock_irqsave(&page_uptodate_lock, flags);
94 clear_buffer_async_read(bh);
95 unlock_buffer(bh);
96 tmp = bh;
97 do {
98 if (!buffer_uptodate(tmp))
99 page_uptodate = 0;
100 if (buffer_async_read(tmp)) {
 /* Another buffer is still being read; it will finish the page. */
101 if (likely(buffer_locked(tmp)))
102 goto still_busy;
103 /* Async buffers must be locked. */
104 BUG();
105 }
106 tmp = tmp->b_this_page;
107 } while (tmp != bh);
108 spin_unlock_irqrestore(&page_uptodate_lock, flags);
109 /*
110 * If none of the buffers had errors then we can set the page uptodate,
111 * but we first have to perform the post read mst fixups, if the
112 * attribute is mst protected, i.e. if NInoMstProtected(ni) is true.
113 * Note we ignore fixup errors as those are detected when
114 * map_mft_record() is called which gives us per record granularity
115 * rather than per page granularity.
116 */
117 if (!NInoMstProtected(ni)) {
118 if (likely(page_uptodate && !PageError(page)))
119 SetPageUptodate(page);
120 } else {
121 char *addr;
122 unsigned int i, recs;
123 u32 rec_size;
124
 /* Record size is taken from the index block size of the inode. */
125 rec_size = ni->itype.index.block_size;
126 recs = PAGE_CACHE_SIZE / rec_size;
127 /* Should have been verified before we got here... */
128 BUG_ON(!recs);
129 addr = kmap_atomic(page, KM_BIO_SRC_IRQ);
 /* Apply the fixup to each of the recs records in the page. */
130 for (i = 0; i < recs; i++)
131 post_read_mst_fixup((NTFS_RECORD*)(addr +
132 i * rec_size), rec_size);
133 flush_dcache_page(page);
134 kunmap_atomic(addr, KM_BIO_SRC_IRQ);
135 if (likely(!PageError(page) && page_uptodate))
136 SetPageUptodate(page);
137 }
138 unlock_page(page);
139 return;
140still_busy:
 /* Leave the page locked; only drop the spinlock taken above. */
141 spin_unlock_irqrestore(&page_uptodate_lock, flags);
142 return;
143}
144
145/**
146 * ntfs_read_block - fill a @page of an address space with data
147 * @page: page cache page to fill with data
148 *
149 * Fill the page @page of the address space belonging to the
 * @page->mapping->host inode.
150 * We read each buffer asynchronously and when all buffers are read in, our io
151 * completion handler ntfs_end_buffer_async_read(), if required, automatically
152 * applies the mst fixups to the page before finally marking it uptodate and
153 * unlocking it.
154 *
155 * We only enforce allocated_size limit because i_size is checked for in
156 * generic_file_read().
157 *
 * Buffers that are holes, lie beyond allocated_size, lie at or beyond
 * initialized_size, or whose location cannot be determined are zeroed and
 * marked uptodate here without any i/o. If no i/o is submitted at all, the
 * page is unlocked before returning, and -EIO is returned if the error bit
 * was set on the page.
 *
158 * Return 0 on success and -errno on error.
159 *
160 * Contains an adapted version of fs/buffer.c::block_read_full_page().
161 */
162static int ntfs_read_block(struct page *page)
163{
164 VCN vcn;
165 LCN lcn;
166 ntfs_inode *ni;
167 ntfs_volume *vol;
168 runlist_element *rl;
169 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
170 sector_t iblock, lblock, zblock;
171 unsigned int blocksize, vcn_ofs;
172 int i, nr;
173 unsigned char blocksize_bits;
174
175 ni = NTFS_I(page->mapping->host);
176 vol = ni->vol;
177
178 /* $MFT/$DATA must have its complete runlist in memory at all times. */
179 BUG_ON(!ni->runlist.rl && !ni->mft_no && !NInoAttr(ni));
180
181 blocksize_bits = VFS_I(ni)->i_blkbits;
182 blocksize = 1 << blocksize_bits;
183
184 if (!page_has_buffers(page))
185 create_empty_buffers(page, blocksize, 0);
186 bh = head = page_buffers(page);
187 if (unlikely(!bh)) {
188 unlock_page(page);
189 return -ENOMEM;
190 }
191
 /*
  * iblock: index of the first block of this page.
  * lblock: index of the first block beyond allocated_size (rounded up).
  * zblock: index of the first block beyond initialized_size (rounded up).
  */
192 iblock = (s64)page->index << (PAGE_CACHE_SHIFT - blocksize_bits);
193 lblock = (ni->allocated_size + blocksize - 1) >> blocksize_bits;
194 zblock = (ni->initialized_size + blocksize - 1) >> blocksize_bits;
195
196 /* Loop through all the buffers in the page. */
 /*
  * rl doubles as the "runlist lock is held" indicator: it is NULL until the
  * lock is first taken and is reset to NULL when mapping fails with the
  * lock dropped. nr counts the buffers collected in arr[] for i/o.
  * Note that each "continue" below still evaluates the loop condition and
  * therefore still advances i, iblock and bh.
  */
197 rl = NULL;
198 nr = i = 0;
199 do {
200 u8 *kaddr;
201
202 if (unlikely(buffer_uptodate(bh)))
203 continue;
 /* Already mapped to a disk block: just queue it for reading. */
204 if (unlikely(buffer_mapped(bh))) {
205 arr[nr++] = bh;
206 continue;
207 }
208 bh->b_bdev = vol->sb->s_bdev;
209 /* Is the block within the allowed limits? */
210 if (iblock < lblock) {
211 BOOL is_retry = FALSE;
212
213 /* Convert iblock into corresponding vcn and offset. */
214 vcn = (VCN)iblock << blocksize_bits >>
215 vol->cluster_size_bits;
216 vcn_ofs = ((VCN)iblock << blocksize_bits) &
217 vol->cluster_size_mask;
218 if (!rl) {
219lock_retry_remap:
220 down_read(&ni->runlist.lock);
221 rl = ni->runlist.rl;
222 }
223 if (likely(rl != NULL)) {
224 /* Seek to element containing target vcn. */
225 while (rl->length && rl[1].vcn <= vcn)
226 rl++;
227 lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
228 } else
229 lcn = LCN_RL_NOT_MAPPED;
230 /* Successful remap. */
231 if (lcn >= 0) {
232 /* Setup buffer head to correct block. */
233 bh->b_blocknr = ((lcn << vol->cluster_size_bits)
234 + vcn_ofs) >> blocksize_bits;
235 set_buffer_mapped(bh);
236 /* Only read initialized data blocks. */
237 if (iblock < zblock) {
238 arr[nr++] = bh;
239 continue;
240 }
241 /* Fully non-initialized data block, zero it. */
242 goto handle_zblock;
243 }
244 /* It is a hole, need to zero it. */
245 if (lcn == LCN_HOLE)
246 goto handle_hole;
247 /* If first try and runlist unmapped, map and retry. */
248 if (!is_retry && lcn == LCN_RL_NOT_MAPPED) {
249 int err;
250 is_retry = TRUE;
251 /*
252 * Attempt to map runlist, dropping lock for
253 * the duration.
254 */
255 up_read(&ni->runlist.lock);
256 err = ntfs_map_runlist(ni, vcn);
257 if (likely(!err))
258 goto lock_retry_remap;
 /* Lock is not held any more; record that and report err as lcn. */
259 rl = NULL;
260 lcn = err;
261 }
262 /* Hard error, zero out region. */
263 bh->b_blocknr = -1;
264 SetPageError(page);
265 ntfs_error(vol->sb, "Failed to read from inode 0x%lx, "
266 "attribute type 0x%x, vcn 0x%llx, "
267 "offset 0x%x because its location on "
268 "disk could not be determined%s "
269 "(error code %lli).", ni->mft_no,
270 ni->type, (unsigned long long)vcn,
271 vcn_ofs, is_retry ? " even after "
272 "retrying" : "", (long long)lcn);
273 }
274 /*
275 * Either iblock was outside lblock limits or
276 * ntfs_rl_vcn_to_lcn() returned error. Just zero that portion
277 * of the page and set the buffer uptodate.
278 */
279handle_hole:
280 bh->b_blocknr = -1UL;
281 clear_buffer_mapped(bh);
282handle_zblock:
 /* Zero the i-th block of the page and mark its buffer uptodate. */
283 kaddr = kmap_atomic(page, KM_USER0);
284 memset(kaddr + i * blocksize, 0, blocksize);
285 flush_dcache_page(page);
286 kunmap_atomic(kaddr, KM_USER0);
287 set_buffer_uptodate(bh);
288 } while (i++, iblock++, (bh = bh->b_this_page) != head);
289
290 /* Release the lock if we took it. */
291 if (rl)
292 up_read(&ni->runlist.lock);
293
294 /* Check we have at least one buffer ready for i/o. */
295 if (nr) {
296 struct buffer_head *tbh;
297
298 /* Lock the buffers. */
 /* All buffers are locked and flagged before any i/o is started. */
299 for (i = 0; i < nr; i++) {
300 tbh = arr[i];
301 lock_buffer(tbh);
302 tbh->b_end_io = ntfs_end_buffer_async_read;
303 set_buffer_async_read(tbh);
304 }
305 /* Finally, start i/o on the buffers. */
306 for (i = 0; i < nr; i++) {
307 tbh = arr[i];
308 if (likely(!buffer_uptodate(tbh)))
309 submit_bh(READ, tbh);
310 else
 /* Already uptodate: skip the read, run the completion handler directly. */
311 ntfs_end_buffer_async_read(tbh, 1);
312 }
313 return 0;
314 }
315 /* No i/o was scheduled on any of the buffers. */
316 if (likely(!PageError(page)))
317 SetPageUptodate(page);
318 else /* Signal synchronous i/o error. */
319 nr = -EIO;
320 unlock_page(page);
321 return nr;
322}
323
324/**
325 * ntfs_readpage - fill a @page of a @file with data from the device
326 * @file: open file to which the page @page belongs or NULL
327 * @page: page cache page to fill with data
328 *
329 * For non-resident attributes, ntfs_readpage() fills the @page of the open
330 * file @file by calling the ntfs version of the generic block_read_full_page()
331 * function, ntfs_read_block(), which in turn creates and reads in the buffers
332 * associated with the page asynchronously.
333 *
334 * For resident attributes, OTOH, ntfs_readpage() fills @page by copying the
335 * data from the mft record (which at this stage is most likely in memory) and
336 * fills the remainder with zeroes. Thus, in this case, I/O is synchronous, as
337 * even if the mft record is not cached at this point in time, we need to wait
338 * for it to be read in before we can do the copy.
339 *
340 * Return 0 on success and -errno on error.
341 */
342static int ntfs_readpage(struct file *file, struct page *page)
343{
344 loff_t i_size;
345 ntfs_inode *ni, *base_ni;
346 u8 *kaddr;
347 ntfs_attr_search_ctx *ctx;
348 MFT_RECORD *mrec;
349 u32 attr_len;
350 int err = 0;
351
352 BUG_ON(!PageLocked(page));
353 /*
354 * This can potentially happen because we clear PageUptodate() during
355 * ntfs_writepage() of MstProtected() attributes.
356 */
357 if (PageUptodate(page)) {
358 unlock_page(page);
359 return 0;
360 }
361 ni = NTFS_I(page->mapping->host);
362
363 /* NInoNonResident() == NInoIndexAllocPresent() */
364 if (NInoNonResident(ni)) {
365 /*
366 * Only unnamed $DATA attributes can be compressed or
367 * encrypted.
368 */
369 if (ni->type == AT_DATA && !ni->name_len) {
370 /* If file is encrypted, deny access, just like NT4. */
371 if (NInoEncrypted(ni)) {
372 err = -EACCES;
373 goto err_out;
374 }
375 /* Compressed data streams are handled in compress.c. */
376 if (NInoCompressed(ni))
377 return ntfs_read_compressed_block(page);
378 }
379 /* Normal data stream. */
380 return ntfs_read_block(page);
381 }
382 /*
383 * Attribute is resident, implying it is not compressed or encrypted.
384 * This also means the attribute is smaller than an mft record and
385 * hence smaller than a page, so can simply zero out any pages with
386 * index above 0. We can also do this if the file size is 0.
387 */
388 if (unlikely(page->index > 0 || !i_size_read(VFS_I(ni)))) {
389 kaddr = kmap_atomic(page, KM_USER0);
390 memset(kaddr, 0, PAGE_CACHE_SIZE);
391 flush_dcache_page(page);
392 kunmap_atomic(kaddr, KM_USER0);
393 goto done;
394 }
395 if (!NInoAttr(ni))
396 base_ni = ni;
397 else
398 base_ni = ni->ext.base_ntfs_ino;
399 /* Map, pin, and lock the mft record. */
400 mrec = map_mft_record(base_ni);
401 if (IS_ERR(mrec)) {
402 err = PTR_ERR(mrec);
403 goto err_out;
404 }
405 ctx = ntfs_attr_get_search_ctx(base_ni, mrec);
406 if (unlikely(!ctx)) {
407 err = -ENOMEM;
408 goto unm_err_out;
409 }
410 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
411 CASE_SENSITIVE, 0, NULL, 0, ctx);
412 if (unlikely(err))
413 goto put_unm_err_out;
414 attr_len = le32_to_cpu(ctx->attr->data.resident.value_length);
415 i_size = i_size_read(VFS_I(ni));
416 if (unlikely(attr_len > i_size))
417 attr_len = i_size;
418 kaddr = kmap_atomic(page, KM_USER0);
419 /* Copy the data to the page. */
420 memcpy(kaddr, (u8*)ctx->attr +
421 le16_to_cpu(ctx->attr->data.resident.value_offset),
422 attr_len);
423 /* Zero the remainder of the page. */
424 memset(kaddr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
425 flush_dcache_page(page);
426 kunmap_atomic(kaddr, KM_USER0);
427put_unm_err_out:
428 ntfs_attr_put_search_ctx(ctx);
429unm_err_out:
430 unmap_mft_record(base_ni);
431done:
432 SetPageUptodate(page);
433err_out:
434 unlock_page(page);
435 return err;
436}
437
438#ifdef NTFS_RW
439
440/**
441 * ntfs_write_block - write a @page to the backing store
442 * @page: page cache page to write out
443 * @wbc: writeback control structure
444 *
445 * This function is for writing pages belonging to non-resident, non-mst
446 * protected attributes to their backing store.
447 *
448 * For a page with buffers, map and write the dirty buffers asynchronously
449 * under page writeback. For a page without buffers, create buffers for the
450 * page, then proceed as above.
451 *
452 * If a page doesn't have buffers the page dirty state is definitive. If a page
453 * does have buffers, the page dirty state is just a hint, and the buffer dirty
454 * state is definitive. (A hint which has rules: dirty buffers against a clean
455 * page is illegal. Other combinations are legal and need to be handled. In
456 * particular a dirty page containing clean buffers for example.)
457 *
458 * Return 0 on success and -errno on error.
459 *
460 * Based on ntfs_read_block() and __block_write_full_page().
461 */
462static int ntfs_write_block(struct page *page, struct writeback_control *wbc)
463{
464 VCN vcn;
465 LCN lcn;
466 sector_t block, dblock, iblock;
467 struct inode *vi;
468 ntfs_inode *ni;
469 ntfs_volume *vol;
470 runlist_element *rl;
471 struct buffer_head *bh, *head;
472 unsigned int blocksize, vcn_ofs;
473 int err;
474 BOOL need_end_writeback;
475 unsigned char blocksize_bits;
476
477 vi = page->mapping->host;
478 ni = NTFS_I(vi);
479 vol = ni->vol;
480
481 ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
482 "0x%lx.", ni->mft_no, ni->type, page->index);
483
484 BUG_ON(!NInoNonResident(ni));
485 BUG_ON(NInoMstProtected(ni));
486
487 blocksize_bits = vi->i_blkbits;
488 blocksize = 1 << blocksize_bits;
489
490 if (!page_has_buffers(page)) {
491 BUG_ON(!PageUptodate(page));
492 create_empty_buffers(page, blocksize,
493 (1 << BH_Uptodate) | (1 << BH_Dirty));
494 }
495 bh = head = page_buffers(page);
496 if (unlikely(!bh)) {
497 ntfs_warning(vol->sb, "Error allocating page buffers. "
498 "Redirtying page so we try again later.");
499 /*
500 * Put the page back on mapping->dirty_pages, but leave its
501 * buffer's dirty state as-is.
502 */
503 redirty_page_for_writepage(wbc, page);
504 unlock_page(page);
505 return 0;
506 }
507
508 /* NOTE: Different naming scheme to ntfs_read_block()! */
509
510 /* The first block in the page. */
511 block = (s64)page->index << (PAGE_CACHE_SHIFT - blocksize_bits);
512
513 /* The first out of bounds block for the data size. */
514 dblock = (vi->i_size + blocksize - 1) >> blocksize_bits;
515
516 /* The last (fully or partially) initialized block. */
517 iblock = ni->initialized_size >> blocksize_bits;
518
519 /*
520 * Be very careful. We have no exclusion from __set_page_dirty_buffers
521 * here, and the (potentially unmapped) buffers may become dirty at
522 * any time. If a buffer becomes dirty here after we've inspected it
523 * then we just miss that fact, and the page stays dirty.
524 *
525 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
526 * handle that here by just cleaning them.
527 */
528
529 /*
530 * Loop through all the buffers in the page, mapping all the dirty
531 * buffers to disk addresses and handling any aliases from the
532 * underlying block device's mapping.
533 */
534 rl = NULL;
535 err = 0;
536 do {
537 BOOL is_retry = FALSE;
538
539 if (unlikely(block >= dblock)) {
540 /*
541 * Mapped buffers outside i_size will occur, because
542 * this page can be outside i_size when there is a
543 * truncate in progress. The contents of such buffers
544 * were zeroed by ntfs_writepage().
545 *
546 * FIXME: What about the small race window where
547 * ntfs_writepage() has not done any clearing because
548 * the page was within i_size but before we get here,
549 * vmtruncate() modifies i_size?
550 */
551 clear_buffer_dirty(bh);
552 set_buffer_uptodate(bh);
553 continue;
554 }
555
556 /* Clean buffers are not written out, so no need to map them. */
557 if (!buffer_dirty(bh))
558 continue;
559
560 /* Make sure we have enough initialized size. */
561 if (unlikely((block >= iblock) &&
562 (ni->initialized_size < vi->i_size))) {
563 /*
564 * If this page is fully outside initialized size, zero
565 * out all pages between the current initialized size
566 * and the current page. Just use ntfs_readpage() to do
567 * the zeroing transparently.
568 */
569 if (block > iblock) {
570 // TODO:
571 // For each page do:
572 // - read_cache_page()
573 // Again for each page do:
574 // - wait_on_page_locked()
575 // - Check (PageUptodate(page) &&
576 // !PageError(page))
577 // Update initialized size in the attribute and
578 // in the inode.
579 // Again, for each page do:
580 // __set_page_dirty_buffers();
581 // page_cache_release()
582 // We don't need to wait on the writes.
583 // Update iblock.
584 }
585 /*
586 * The current page straddles initialized size. Zero
587 * all non-uptodate buffers and set them uptodate (and
588 * dirty?). Note, there aren't any non-uptodate buffers
589 * if the page is uptodate.
590 * FIXME: For an uptodate page, the buffers may need to
591 * be written out because they were not initialized on
592 * disk before.
593 */
594 if (!PageUptodate(page)) {
595 // TODO:
596 // Zero any non-uptodate buffers up to i_size.
597 // Set them uptodate and dirty.
598 }
599 // TODO:
600 // Update initialized size in the attribute and in the
601 // inode (up to i_size).
602 // Update iblock.
603 // FIXME: This is inefficient. Try to batch the two
604 // size changes to happen in one go.
605 ntfs_error(vol->sb, "Writing beyond initialized size "
606 "is not supported yet. Sorry.");
607 err = -EOPNOTSUPP;
608 break;
609 // Do NOT set_buffer_new() BUT DO clear buffer range
610 // outside write request range.
611 // set_buffer_uptodate() on complete buffers as well as
612 // set_buffer_dirty().
613 }
614
615 /* No need to map buffers that are already mapped. */
616 if (buffer_mapped(bh))
617 continue;
618
619 /* Unmapped, dirty buffer. Need to map it. */
620 bh->b_bdev = vol->sb->s_bdev;
621
622 /* Convert block into corresponding vcn and offset. */
623 vcn = (VCN)block << blocksize_bits;
624 vcn_ofs = vcn & vol->cluster_size_mask;
625 vcn >>= vol->cluster_size_bits;
626 if (!rl) {
627lock_retry_remap:
628 down_read(&ni->runlist.lock);
629 rl = ni->runlist.rl;
630 }
631 if (likely(rl != NULL)) {
632 /* Seek to element containing target vcn. */
633 while (rl->length && rl[1].vcn <= vcn)
634 rl++;
635 lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
636 } else
637 lcn = LCN_RL_NOT_MAPPED;
638 /* Successful remap. */
639 if (lcn >= 0) {
640 /* Setup buffer head to point to correct block. */
641 bh->b_blocknr = ((lcn << vol->cluster_size_bits) +
642 vcn_ofs) >> blocksize_bits;
643 set_buffer_mapped(bh);
644 continue;
645 }
646 /* It is a hole, need to instantiate it. */
647 if (lcn == LCN_HOLE) {
648 // TODO: Instantiate the hole.
649 // clear_buffer_new(bh);
650 // unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
651 ntfs_error(vol->sb, "Writing into sparse regions is "
652 "not supported yet. Sorry.");
653 err = -EOPNOTSUPP;
654 break;
655 }
656 /* If first try and runlist unmapped, map and retry. */
657 if (!is_retry && lcn == LCN_RL_NOT_MAPPED) {
658 is_retry = TRUE;
659 /*
660 * Attempt to map runlist, dropping lock for
661 * the duration.
662 */
663 up_read(&ni->runlist.lock);
664 err = ntfs_map_runlist(ni, vcn);
665 if (likely(!err))
666 goto lock_retry_remap;
667 rl = NULL;
668 lcn = err;
669 }
670 /* Failed to map the buffer, even after retrying. */
671 bh->b_blocknr = -1;
672 ntfs_error(vol->sb, "Failed to write to inode 0x%lx, "
673 "attribute type 0x%x, vcn 0x%llx, offset 0x%x "
674 "because its location on disk could not be "
675 "determined%s (error code %lli).", ni->mft_no,
676 ni->type, (unsigned long long)vcn,
677 vcn_ofs, is_retry ? " even after "
678 "retrying" : "", (long long)lcn);
679 if (!err)
680 err = -EIO;
681 break;
682 } while (block++, (bh = bh->b_this_page) != head);
683
684 /* Release the lock if we took it. */
685 if (rl)
686 up_read(&ni->runlist.lock);
687
688 /* For the error case, need to reset bh to the beginning. */
689 bh = head;
690
691 /* Just an optimization, so ->readpage() isn't called later. */
692 if (unlikely(!PageUptodate(page))) {
693 int uptodate = 1;
694 do {
695 if (!buffer_uptodate(bh)) {
696 uptodate = 0;
697 bh = head;
698 break;
699 }
700 } while ((bh = bh->b_this_page) != head);
701 if (uptodate)
702 SetPageUptodate(page);
703 }
704
705 /* Setup all mapped, dirty buffers for async write i/o. */
706 do {
707 get_bh(bh);
708 if (buffer_mapped(bh) && buffer_dirty(bh)) {
709 lock_buffer(bh);
710 if (test_clear_buffer_dirty(bh)) {
711 BUG_ON(!buffer_uptodate(bh));
712 mark_buffer_async_write(bh);
713 } else
714 unlock_buffer(bh);
715 } else if (unlikely(err)) {
716 /*
717 * For the error case. The buffer may have been set
718 * dirty during attachment to a dirty page.
719 */
720 if (err != -ENOMEM)
721 clear_buffer_dirty(bh);
722 }
723 } while ((bh = bh->b_this_page) != head);
724
725 if (unlikely(err)) {
726 // TODO: Remove the -EOPNOTSUPP check later on...
727 if (unlikely(err == -EOPNOTSUPP))
728 err = 0;
729 else if (err == -ENOMEM) {
730 ntfs_warning(vol->sb, "Error allocating memory. "
731 "Redirtying page so we try again "
732 "later.");
733 /*
734 * Put the page back on mapping->dirty_pages, but
735 * leave its buffer's dirty state as-is.
736 */
737 redirty_page_for_writepage(wbc, page);
738 err = 0;
739 } else
740 SetPageError(page);
741 }
742
743 BUG_ON(PageWriteback(page));
744 set_page_writeback(page); /* Keeps try_to_free_buffers() away. */
745 unlock_page(page);
746
747 /*
748 * Submit the prepared buffers for i/o. Note the page is unlocked,
749 * and the async write i/o completion handler can end_page_writeback()
750 * at any time after the *first* submit_bh(). So the buffers can then
751 * disappear...
752 */
753 need_end_writeback = TRUE;
754 do {
755 struct buffer_head *next = bh->b_this_page;
756 if (buffer_async_write(bh)) {
757 submit_bh(WRITE, bh);
758 need_end_writeback = FALSE;
759 }
760 put_bh(bh);
761 bh = next;
762 } while (bh != head);
763
764 /* If no i/o was started, need to end_page_writeback(). */
765 if (unlikely(need_end_writeback))
766 end_page_writeback(page);
767
768 ntfs_debug("Done.");
769 return err;
770}
771
772/**
773 * ntfs_write_mst_block - write a @page to the backing store
774 * @page: page cache page to write out
775 * @wbc: writeback control structure
776 *
777 * This function is for writing pages belonging to non-resident, mst protected
778 * attributes to their backing store. The only supported attributes are index
779 * allocation and $MFT/$DATA. Both directory inodes and index inodes are
780 * supported for the index allocation case.
781 *
782 * The page must remain locked for the duration of the write because we apply
783 * the mst fixups, write, and then undo the fixups, so if we were to unlock the
784 * page before undoing the fixups, any other user of the page will see the
785 * page contents as corrupt.
786 *
787 * We clear the page uptodate flag for the duration of the function to ensure
788 * exclusion for the $MFT/$DATA case against someone mapping an mft record we
789 * are about to apply the mst fixups to.
790 *
791 * Return 0 on success and -errno on error.
792 *
793 * Based on ntfs_write_block(), ntfs_mft_writepage(), and
794 * write_mft_record_nolock().
795 */
796static int ntfs_write_mst_block(struct page *page,
797 struct writeback_control *wbc)
798{
799 sector_t block, dblock, rec_block;
800 struct inode *vi = page->mapping->host;
801 ntfs_inode *ni = NTFS_I(vi);
802 ntfs_volume *vol = ni->vol;
803 u8 *kaddr;
804 unsigned char bh_size_bits = vi->i_blkbits;
805 unsigned int bh_size = 1 << bh_size_bits;
806 unsigned int rec_size = ni->itype.index.block_size;
807 ntfs_inode *locked_nis[PAGE_CACHE_SIZE / rec_size];
808 struct buffer_head *bh, *head, *tbh, *rec_start_bh;
809 int max_bhs = PAGE_CACHE_SIZE / bh_size;
810 struct buffer_head *bhs[max_bhs];
811 runlist_element *rl;
812 int i, nr_locked_nis, nr_recs, nr_bhs, bhs_per_rec, err, err2;
813 unsigned rec_size_bits;
814 BOOL sync, is_mft, page_is_dirty, rec_is_dirty;
815
816 ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
817 "0x%lx.", vi->i_ino, ni->type, page->index);
818 BUG_ON(!NInoNonResident(ni));
819 BUG_ON(!NInoMstProtected(ni));
820 is_mft = (S_ISREG(vi->i_mode) && !vi->i_ino);
821 /*
822 * NOTE: ntfs_write_mst_block() would be called for $MFTMirr if a page
823 * in its page cache were to be marked dirty. However this should
824 * never happen with the current driver and considering we do not
825 * handle this case here we do want to BUG(), at least for now.
826 */
827 BUG_ON(!(is_mft || S_ISDIR(vi->i_mode) ||
828 (NInoAttr(ni) && ni->type == AT_INDEX_ALLOCATION)));
829 BUG_ON(!max_bhs);
830
831 /* Were we called for sync purposes? */
832 sync = (wbc->sync_mode == WB_SYNC_ALL);
833
834 /* Make sure we have mapped buffers. */
835 BUG_ON(!page_has_buffers(page));
836 bh = head = page_buffers(page);
837 BUG_ON(!bh);
838
839 rec_size_bits = ni->itype.index.block_size_bits;
840 BUG_ON(!(PAGE_CACHE_SIZE >> rec_size_bits));
841 bhs_per_rec = rec_size >> bh_size_bits;
842 BUG_ON(!bhs_per_rec);
843
844 /* The first block in the page. */
845 rec_block = block = (sector_t)page->index <<
846 (PAGE_CACHE_SHIFT - bh_size_bits);
847
848 /* The first out of bounds block for the data size. */
849 dblock = (vi->i_size + bh_size - 1) >> bh_size_bits;
850
851 rl = NULL;
852 err = err2 = nr_bhs = nr_recs = nr_locked_nis = 0;
853 page_is_dirty = rec_is_dirty = FALSE;
854 rec_start_bh = NULL;
855 do {
856 BOOL is_retry = FALSE;
857
858 if (likely(block < rec_block)) {
859 if (unlikely(block >= dblock)) {
860 clear_buffer_dirty(bh);
861 continue;
862 }
863 /*
864 * This block is not the first one in the record. We
865 * ignore the buffer's dirty state because we could
866 * have raced with a parallel mark_ntfs_record_dirty().
867 */
868 if (!rec_is_dirty)
869 continue;
870 if (unlikely(err2)) {
871 if (err2 != -ENOMEM)
872 clear_buffer_dirty(bh);
873 continue;
874 }
875 } else /* if (block == rec_block) */ {
876 BUG_ON(block > rec_block);
877 /* This block is the first one in the record. */
878 rec_block += bhs_per_rec;
879 err2 = 0;
880 if (unlikely(block >= dblock)) {
881 clear_buffer_dirty(bh);
882 continue;
883 }
884 if (!buffer_dirty(bh)) {
885 /* Clean records are not written out. */
886 rec_is_dirty = FALSE;
887 continue;
888 }
889 rec_is_dirty = TRUE;
890 rec_start_bh = bh;
891 }
892 /* Need to map the buffer if it is not mapped already. */
893 if (unlikely(!buffer_mapped(bh))) {
894 VCN vcn;
895 LCN lcn;
896 unsigned int vcn_ofs;
897
898 /* Obtain the vcn and offset of the current block. */
899 vcn = (VCN)block << bh_size_bits;
900 vcn_ofs = vcn & vol->cluster_size_mask;
901 vcn >>= vol->cluster_size_bits;
902 if (!rl) {
903lock_retry_remap:
904 down_read(&ni->runlist.lock);
905 rl = ni->runlist.rl;
906 }
907 if (likely(rl != NULL)) {
908 /* Seek to element containing target vcn. */
909 while (rl->length && rl[1].vcn <= vcn)
910 rl++;
911 lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
912 } else
913 lcn = LCN_RL_NOT_MAPPED;
914 /* Successful remap. */
915 if (likely(lcn >= 0)) {
916 /* Setup buffer head to correct block. */
917 bh->b_blocknr = ((lcn <<
918 vol->cluster_size_bits) +
919 vcn_ofs) >> bh_size_bits;
920 set_buffer_mapped(bh);
921 } else {
922 /*
923 * Remap failed. Retry to map the runlist once
924 * unless we are working on $MFT which always
925 * has the whole of its runlist in memory.
926 */
927 if (!is_mft && !is_retry &&
928 lcn == LCN_RL_NOT_MAPPED) {
929 is_retry = TRUE;
930 /*
931 * Attempt to map runlist, dropping
932 * lock for the duration.
933 */
934 up_read(&ni->runlist.lock);
935 err2 = ntfs_map_runlist(ni, vcn);
936 if (likely(!err2))
937 goto lock_retry_remap;
938 if (err2 == -ENOMEM)
939 page_is_dirty = TRUE;
940 lcn = err2;
941 } else
942 err2 = -EIO;
943 /* Hard error. Abort writing this record. */
944 if (!err || err == -ENOMEM)
945 err = err2;
946 bh->b_blocknr = -1;
947 ntfs_error(vol->sb, "Cannot write ntfs record "
948 "0x%llx (inode 0x%lx, "
949 "attribute type 0x%x) because "
950 "its location on disk could "
951 "not be determined (error "
952 "code %lli).", (s64)block <<
953 bh_size_bits >>
954 vol->mft_record_size_bits,
955 ni->mft_no, ni->type,
956 (long long)lcn);
957 /*
958 * If this is not the first buffer, remove the
959 * buffers in this record from the list of
960 * buffers to write and clear their dirty bit
961 * if not error -ENOMEM.
962 */
963 if (rec_start_bh != bh) {
964 while (bhs[--nr_bhs] != rec_start_bh)
965 ;
966 if (err2 != -ENOMEM) {
967 do {
968 clear_buffer_dirty(
969 rec_start_bh);
970 } while ((rec_start_bh =
971 rec_start_bh->
972 b_this_page) !=
973 bh);
974 }
975 }
976 continue;
977 }
978 }
979 BUG_ON(!buffer_uptodate(bh));
980 BUG_ON(nr_bhs >= max_bhs);
981 bhs[nr_bhs++] = bh;
982 } while (block++, (bh = bh->b_this_page) != head);
983 if (unlikely(rl))
984 up_read(&ni->runlist.lock);
985 /* If there were no dirty buffers, we are done. */
986 if (!nr_bhs)
987 goto done;
988 /* Map the page so we can access its contents. */
989 kaddr = kmap(page);
990 /* Clear the page uptodate flag whilst the mst fixups are applied. */
991 BUG_ON(!PageUptodate(page));
992 ClearPageUptodate(page);
993 for (i = 0; i < nr_bhs; i++) {
994 unsigned int ofs;
995
996 /* Skip buffers which are not at the beginning of records. */
997 if (i % bhs_per_rec)
998 continue;
999 tbh = bhs[i];
1000 ofs = bh_offset(tbh);
1001 if (is_mft) {
1002 ntfs_inode *tni;
1003 unsigned long mft_no;
1004
1005 /* Get the mft record number. */
1006 mft_no = (((s64)page->index << PAGE_CACHE_SHIFT) + ofs)
1007 >> rec_size_bits;
1008 /* Check whether to write this mft record. */
1009 tni = NULL;
1010 if (!ntfs_may_write_mft_record(vol, mft_no,
1011 (MFT_RECORD*)(kaddr + ofs), &tni)) {
1012 /*
1013 * The record should not be written. This
1014 * means we need to redirty the page before
1015 * returning.
1016 */
1017 page_is_dirty = TRUE;
1018 /*
1019 * Remove the buffers in this mft record from
1020 * the list of buffers to write.
1021 */
1022 do {
1023 bhs[i] = NULL;
1024 } while (++i % bhs_per_rec);
1025 continue;
1026 }
1027 /*
1028 * The record should be written. If a locked ntfs
1029 * inode was returned, add it to the array of locked
1030 * ntfs inodes.
1031 */
1032 if (tni)
1033 locked_nis[nr_locked_nis++] = tni;
1034 }
1035 /* Apply the mst protection fixups. */
1036 err2 = pre_write_mst_fixup((NTFS_RECORD*)(kaddr + ofs),
1037 rec_size);
1038 if (unlikely(err2)) {
1039 if (!err || err == -ENOMEM)
1040 err = -EIO;
1041 ntfs_error(vol->sb, "Failed to apply mst fixups "
1042 "(inode 0x%lx, attribute type 0x%x, "
1043 "page index 0x%lx, page offset 0x%x)!"
1044 " Unmount and run chkdsk.", vi->i_ino,
1045 ni->type, page->index, ofs);
1046 /*
1047 * Mark all the buffers in this record clean as we do
1048 * not want to write corrupt data to disk.
1049 */
1050 do {
1051 clear_buffer_dirty(bhs[i]);
1052 bhs[i] = NULL;
1053 } while (++i % bhs_per_rec);
1054 continue;
1055 }
1056 nr_recs++;
1057 }
1058 /* If no records are to be written out, we are done. */
1059 if (!nr_recs)
1060 goto unm_done;
1061 flush_dcache_page(page);
1062 /* Lock buffers and start synchronous write i/o on them. */
1063 for (i = 0; i < nr_bhs; i++) {
1064 tbh = bhs[i];
1065 if (!tbh)
1066 continue;
1067 if (unlikely(test_set_buffer_locked(tbh)))
1068 BUG();
1069 /* The buffer dirty state is now irrelevant, just clean it. */
1070 clear_buffer_dirty(tbh);
1071 BUG_ON(!buffer_uptodate(tbh));
1072 BUG_ON(!buffer_mapped(tbh));
1073 get_bh(tbh);
1074 tbh->b_end_io = end_buffer_write_sync;
1075 submit_bh(WRITE, tbh);
1076 }
1077 /* Synchronize the mft mirror now if not @sync. */
1078 if (is_mft && !sync)
1079 goto do_mirror;
1080do_wait:
1081 /* Wait on i/o completion of buffers. */
1082 for (i = 0; i < nr_bhs; i++) {
1083 tbh = bhs[i];
1084 if (!tbh)
1085 continue;
1086 wait_on_buffer(tbh);
1087 if (unlikely(!buffer_uptodate(tbh))) {
1088 ntfs_error(vol->sb, "I/O error while writing ntfs "
1089 "record buffer (inode 0x%lx, "
1090 "attribute type 0x%x, page index "
1091 "0x%lx, page offset 0x%lx)! Unmount "
1092 "and run chkdsk.", vi->i_ino, ni->type,
1093 page->index, bh_offset(tbh));
1094 if (!err || err == -ENOMEM)
1095 err = -EIO;
1096 /*
1097 * Set the buffer uptodate so the page and buffer
1098 * states do not become out of sync.
1099 */
1100 set_buffer_uptodate(tbh);
1101 }
1102 }
1103 /* If @sync, now synchronize the mft mirror. */
1104 if (is_mft && sync) {
1105do_mirror:
1106 for (i = 0; i < nr_bhs; i++) {
1107 unsigned long mft_no;
1108 unsigned int ofs;
1109
1110 /*
1111 * Skip buffers which are not at the beginning of
1112 * records.
1113 */
1114 if (i % bhs_per_rec)
1115 continue;
1116 tbh = bhs[i];
1117 /* Skip removed buffers (and hence records). */
1118 if (!tbh)
1119 continue;
1120 ofs = bh_offset(tbh);
1121 /* Get the mft record number. */
1122 mft_no = (((s64)page->index << PAGE_CACHE_SHIFT) + ofs)
1123 >> rec_size_bits;
1124 if (mft_no < vol->mftmirr_size)
1125 ntfs_sync_mft_mirror(vol, mft_no,
1126 (MFT_RECORD*)(kaddr + ofs),
1127 sync);
1128 }
1129 if (!sync)
1130 goto do_wait;
1131 }
1132 /* Remove the mst protection fixups again. */
1133 for (i = 0; i < nr_bhs; i++) {
1134 if (!(i % bhs_per_rec)) {
1135 tbh = bhs[i];
1136 if (!tbh)
1137 continue;
1138 post_write_mst_fixup((NTFS_RECORD*)(kaddr +
1139 bh_offset(tbh)));
1140 }
1141 }
1142 flush_dcache_page(page);
1143unm_done:
1144 /* Unlock any locked inodes. */
1145 while (nr_locked_nis-- > 0) {
1146 ntfs_inode *tni, *base_tni;
1147
1148 tni = locked_nis[nr_locked_nis];
1149 /* Get the base inode. */
1150 down(&tni->extent_lock);
1151 if (tni->nr_extents >= 0)
1152 base_tni = tni;
1153 else {
1154 base_tni = tni->ext.base_ntfs_ino;
1155 BUG_ON(!base_tni);
1156 }
1157 up(&tni->extent_lock);
1158 ntfs_debug("Unlocking %s inode 0x%lx.",
1159 tni == base_tni ? "base" : "extent",
1160 tni->mft_no);
1161 up(&tni->mrec_lock);
1162 atomic_dec(&tni->count);
1163 iput(VFS_I(base_tni));
1164 }
1165 SetPageUptodate(page);
1166 kunmap(page);
1167done:
1168 if (unlikely(err && err != -ENOMEM)) {
1169 /*
1170 * Set page error if there is only one ntfs record in the page.
1171 * Otherwise we would loose per-record granularity.
1172 */
1173 if (ni->itype.index.block_size == PAGE_CACHE_SIZE)
1174 SetPageError(page);
1175 NVolSetErrors(vol);
1176 }
1177 if (page_is_dirty) {
1178 ntfs_debug("Page still contains one or more dirty ntfs "
1179 "records. Redirtying the page starting at "
1180 "record 0x%lx.", page->index <<
1181 (PAGE_CACHE_SHIFT - rec_size_bits));
1182 redirty_page_for_writepage(wbc, page);
1183 unlock_page(page);
1184 } else {
1185 /*
1186 * Keep the VM happy. This must be done otherwise the
1187 * radix-tree tag PAGECACHE_TAG_DIRTY remains set even though
1188 * the page is clean.
1189 */
1190 BUG_ON(PageWriteback(page));
1191 set_page_writeback(page);
1192 unlock_page(page);
1193 end_page_writeback(page);
1194 }
1195 if (likely(!err))
1196 ntfs_debug("Done.");
1197 return err;
1198}
1199
1200/**
1201 * ntfs_writepage - write a @page to the backing store
1202 * @page: page cache page to write out
1203 * @wbc: writeback control structure
1204 *
1205 * This is called from the VM when it wants to have a dirty ntfs page cache
1206 * page cleaned. The VM has already locked the page and marked it clean.
1207 *
1208 * For non-resident attributes, ntfs_writepage() writes the @page by calling
1209 * the ntfs version of the generic block_write_full_page() function,
1210 * ntfs_write_block(), which in turn if necessary creates and writes the
1211 * buffers associated with the page asynchronously.
1212 *
1213 * For resident attributes, OTOH, ntfs_writepage() writes the @page by copying
1214 * the data to the mft record (which at this stage is most likely in memory).
1215 * The mft record is then marked dirty and written out asynchronously via the
1216 * vfs inode dirty code path for the inode the mft record belongs to or via the
1217 * vm page dirty code path for the page the mft record is in.
1218 *
1219 * Based on ntfs_readpage() and fs/buffer.c::block_write_full_page().
1220 *
1221 * Return 0 on success and -errno on error.
1222 */
1223static int ntfs_writepage(struct page *page, struct writeback_control *wbc)
1224{
1225 loff_t i_size;
1226 struct inode *vi;
1227 ntfs_inode *ni, *base_ni;
1228 char *kaddr;
1229 ntfs_attr_search_ctx *ctx;
1230 MFT_RECORD *m;
1231 u32 attr_len;
1232 int err;
1233
1234 BUG_ON(!PageLocked(page));
1235
1236 vi = page->mapping->host;
1237 i_size = i_size_read(vi);
1238
1239 /* Is the page fully outside i_size? (truncate in progress) */
1240 if (unlikely(page->index >= (i_size + PAGE_CACHE_SIZE - 1) >>
1241 PAGE_CACHE_SHIFT)) {
1242 /*
1243 * The page may have dirty, unmapped buffers. Make them
1244 * freeable here, so the page does not leak.
1245 */
1246 block_invalidatepage(page, 0);
1247 unlock_page(page);
1248 ntfs_debug("Write outside i_size - truncated?");
1249 return 0;
1250 }
1251 ni = NTFS_I(vi);
1252
1253 /* NInoNonResident() == NInoIndexAllocPresent() */
1254 if (NInoNonResident(ni)) {
1255 /*
1256 * Only unnamed $DATA attributes can be compressed, encrypted,
1257 * and/or sparse.
1258 */
1259 if (ni->type == AT_DATA && !ni->name_len) {
1260 /* If file is encrypted, deny access, just like NT4. */
1261 if (NInoEncrypted(ni)) {
1262 unlock_page(page);
1263 ntfs_debug("Denying write access to encrypted "
1264 "file.");
1265 return -EACCES;
1266 }
1267 /* Compressed data streams are handled in compress.c. */
1268 if (NInoCompressed(ni)) {
1269 // TODO: Implement and replace this check with
1270 // return ntfs_write_compressed_block(page);
1271 unlock_page(page);
1272 ntfs_error(vi->i_sb, "Writing to compressed "
1273 "files is not supported yet. "
1274 "Sorry.");
1275 return -EOPNOTSUPP;
1276 }
1277 // TODO: Implement and remove this check.
1278 if (NInoSparse(ni)) {
1279 unlock_page(page);
1280 ntfs_error(vi->i_sb, "Writing to sparse files "
1281 "is not supported yet. Sorry.");
1282 return -EOPNOTSUPP;
1283 }
1284 }
1285 /* We have to zero every time due to mmap-at-end-of-file. */
1286 if (page->index >= (i_size >> PAGE_CACHE_SHIFT)) {
1287 /* The page straddles i_size. */
1288 unsigned int ofs = i_size & ~PAGE_CACHE_MASK;
1289 kaddr = kmap_atomic(page, KM_USER0);
1290 memset(kaddr + ofs, 0, PAGE_CACHE_SIZE - ofs);
1291 flush_dcache_page(page);
1292 kunmap_atomic(kaddr, KM_USER0);
1293 }
1294 /* Handle mst protected attributes. */
1295 if (NInoMstProtected(ni))
1296 return ntfs_write_mst_block(page, wbc);
1297 /* Normal data stream. */
1298 return ntfs_write_block(page, wbc);
1299 }
1300 /*
1301 * Attribute is resident, implying it is not compressed, encrypted,
1302 * sparse, or mst protected. This also means the attribute is smaller
1303 * than an mft record and hence smaller than a page, so can simply
1304 * return error on any pages with index above 0.
1305 */
1306 BUG_ON(page_has_buffers(page));
1307 BUG_ON(!PageUptodate(page));
1308 if (unlikely(page->index > 0)) {
1309 ntfs_error(vi->i_sb, "BUG()! page->index (0x%lx) > 0. "
1310 "Aborting write.", page->index);
1311 BUG_ON(PageWriteback(page));
1312 set_page_writeback(page);
1313 unlock_page(page);
1314 end_page_writeback(page);
1315 return -EIO;
1316 }
1317 if (!NInoAttr(ni))
1318 base_ni = ni;
1319 else
1320 base_ni = ni->ext.base_ntfs_ino;
1321 /* Map, pin, and lock the mft record. */
1322 m = map_mft_record(base_ni);
1323 if (IS_ERR(m)) {
1324 err = PTR_ERR(m);
1325 m = NULL;
1326 ctx = NULL;
1327 goto err_out;
1328 }
1329 ctx = ntfs_attr_get_search_ctx(base_ni, m);
1330 if (unlikely(!ctx)) {
1331 err = -ENOMEM;
1332 goto err_out;
1333 }
1334 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
1335 CASE_SENSITIVE, 0, NULL, 0, ctx);
1336 if (unlikely(err))
1337 goto err_out;
1338 /*
1339 * Keep the VM happy. This must be done otherwise the radix-tree tag
1340 * PAGECACHE_TAG_DIRTY remains set even though the page is clean.
1341 */
1342 BUG_ON(PageWriteback(page));
1343 set_page_writeback(page);
1344 unlock_page(page);
1345
1346 /*
1347 * Here, we don't need to zero the out of bounds area everytime because
1348 * the below memcpy() already takes care of the mmap-at-end-of-file
1349 * requirements. If the file is converted to a non-resident one, then
1350 * the code path use is switched to the non-resident one where the
1351 * zeroing happens on each ntfs_writepage() invocation.
1352 *
1353 * The above also applies nicely when i_size is decreased.
1354 *
1355 * When i_size is increased, the memory between the old and new i_size
1356 * _must_ be zeroed (or overwritten with new data). Otherwise we will
1357 * expose data to userspace/disk which should never have been exposed.
1358 *
1359 * FIXME: Ensure that i_size increases do the zeroing/overwriting and
1360 * if we cannot guarantee that, then enable the zeroing below. If the
1361 * zeroing below is enabled, we MUST move the unlock_page() from above
1362 * to after the kunmap_atomic(), i.e. just before the
1363 * end_page_writeback().
1364 * UPDATE: ntfs_prepare/commit_write() do the zeroing on i_size
1365 * increases for resident attributes so those are ok.
1366 * TODO: ntfs_truncate(), others?
1367 */
1368
1369 attr_len = le32_to_cpu(ctx->attr->data.resident.value_length);
1370 i_size = i_size_read(VFS_I(ni));
1371 kaddr = kmap_atomic(page, KM_USER0);
1372 if (unlikely(attr_len > i_size)) {
1373 /* Zero out of bounds area in the mft record. */
1374 memset((u8*)ctx->attr + le16_to_cpu(
1375 ctx->attr->data.resident.value_offset) +
1376 i_size, 0, attr_len - i_size);
1377 attr_len = i_size;
1378 }
1379 /* Copy the data from the page to the mft record. */
1380 memcpy((u8*)ctx->attr +
1381 le16_to_cpu(ctx->attr->data.resident.value_offset),
1382 kaddr, attr_len);
1383 flush_dcache_mft_record_page(ctx->ntfs_ino);
1384 /* Zero out of bounds area in the page cache page. */
1385 memset(kaddr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
1386 flush_dcache_page(page);
1387 kunmap_atomic(kaddr, KM_USER0);
1388
1389 end_page_writeback(page);
1390
1391 /* Mark the mft record dirty, so it gets written back. */
1392 mark_mft_record_dirty(ctx->ntfs_ino);
1393 ntfs_attr_put_search_ctx(ctx);
1394 unmap_mft_record(base_ni);
1395 return 0;
1396err_out:
1397 if (err == -ENOMEM) {
1398 ntfs_warning(vi->i_sb, "Error allocating memory. Redirtying "
1399 "page so we try again later.");
1400 /*
1401 * Put the page back on mapping->dirty_pages, but leave its
1402 * buffers' dirty state as-is.
1403 */
1404 redirty_page_for_writepage(wbc, page);
1405 err = 0;
1406 } else {
1407 ntfs_error(vi->i_sb, "Resident attribute write failed with "
1408 "error %i. Setting page error flag.", err);
1409 SetPageError(page);
1410 }
1411 unlock_page(page);
1412 if (ctx)
1413 ntfs_attr_put_search_ctx(ctx);
1414 if (m)
1415 unmap_mft_record(base_ni);
1416 return err;
1417}
1418
1419/**
1420 * ntfs_prepare_nonresident_write -
1421 *
1422 */
1423static int ntfs_prepare_nonresident_write(struct page *page,
1424 unsigned from, unsigned to)
1425{
1426 VCN vcn;
1427 LCN lcn;
1428 sector_t block, ablock, iblock;
1429 struct inode *vi;
1430 ntfs_inode *ni;
1431 ntfs_volume *vol;
1432 runlist_element *rl;
1433 struct buffer_head *bh, *head, *wait[2], **wait_bh = wait;
1434 unsigned int vcn_ofs, block_start, block_end, blocksize;
1435 int err;
1436 BOOL is_retry;
1437 unsigned char blocksize_bits;
1438
1439 vi = page->mapping->host;
1440 ni = NTFS_I(vi);
1441 vol = ni->vol;
1442
1443 ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
1444 "0x%lx, from = %u, to = %u.", ni->mft_no, ni->type,
1445 page->index, from, to);
1446
1447 BUG_ON(!NInoNonResident(ni));
1448
1449 blocksize_bits = vi->i_blkbits;
1450 blocksize = 1 << blocksize_bits;
1451
1452 /*
1453 * create_empty_buffers() will create uptodate/dirty buffers if the
1454 * page is uptodate/dirty.
1455 */
1456 if (!page_has_buffers(page))
1457 create_empty_buffers(page, blocksize, 0);
1458 bh = head = page_buffers(page);
1459 if (unlikely(!bh))
1460 return -ENOMEM;
1461
1462 /* The first block in the page. */
1463 block = (s64)page->index << (PAGE_CACHE_SHIFT - blocksize_bits);
1464
1465 /*
1466 * The first out of bounds block for the allocated size. No need to
1467 * round up as allocated_size is in multiples of cluster size and the
1468 * minimum cluster size is 512 bytes, which is equal to the smallest
1469 * blocksize.
1470 */
1471 ablock = ni->allocated_size >> blocksize_bits;
1472
1473 /* The last (fully or partially) initialized block. */
1474 iblock = ni->initialized_size >> blocksize_bits;
1475
1476 /* Loop through all the buffers in the page. */
1477 block_start = 0;
1478 rl = NULL;
1479 err = 0;
1480 do {
1481 block_end = block_start + blocksize;
1482 /*
1483 * If buffer @bh is outside the write, just mark it uptodate
1484 * if the page is uptodate and continue with the next buffer.
1485 */
1486 if (block_end <= from || block_start >= to) {
1487 if (PageUptodate(page)) {
1488 if (!buffer_uptodate(bh))
1489 set_buffer_uptodate(bh);
1490 }
1491 continue;
1492 }
1493 /*
1494 * @bh is at least partially being written to.
1495 * Make sure it is not marked as new.
1496 */
1497 //if (buffer_new(bh))
1498 // clear_buffer_new(bh);
1499
1500 if (block >= ablock) {
1501 // TODO: block is above allocated_size, need to
1502 // allocate it. Best done in one go to accommodate not
1503 // only block but all above blocks up to and including:
1504 // ((page->index << PAGE_CACHE_SHIFT) + to + blocksize
1505 // - 1) >> blocksize_bits. Obviously will need to round
1506 // up to next cluster boundary, too. This should be
1507 // done with a helper function, so it can be reused.
1508 ntfs_error(vol->sb, "Writing beyond allocated size "
1509 "is not supported yet. Sorry.");
1510 err = -EOPNOTSUPP;
1511 goto err_out;
1512 // Need to update ablock.
1513 // Need to set_buffer_new() on all block bhs that are
1514 // newly allocated.
1515 }
1516 /*
1517 * Now we have enough allocated size to fulfill the whole
1518 * request, i.e. block < ablock is true.
1519 */
1520 if (unlikely((block >= iblock) &&
1521 (ni->initialized_size < vi->i_size))) {
1522 /*
1523 * If this page is fully outside initialized size, zero
1524 * out all pages between the current initialized size
1525 * and the current page. Just use ntfs_readpage() to do
1526 * the zeroing transparently.
1527 */
1528 if (block > iblock) {
1529 // TODO:
1530 // For each page do:
1531 // - read_cache_page()
1532 // Again for each page do:
1533 // - wait_on_page_locked()
1534 // - Check (PageUptodate(page) &&
1535 // !PageError(page))
1536 // Update initialized size in the attribute and
1537 // in the inode.
1538 // Again, for each page do:
1539 // __set_page_dirty_buffers();
1540 // page_cache_release()
1541 // We don't need to wait on the writes.
1542 // Update iblock.
1543 }
1544 /*
1545 * The current page straddles initialized size. Zero
1546 * all non-uptodate buffers and set them uptodate (and
1547 * dirty?). Note, there aren't any non-uptodate buffers
1548 * if the page is uptodate.
1549 * FIXME: For an uptodate page, the buffers may need to
1550 * be written out because they were not initialized on
1551 * disk before.
1552 */
1553 if (!PageUptodate(page)) {
1554 // TODO:
1555 // Zero any non-uptodate buffers up to i_size.
1556 // Set them uptodate and dirty.
1557 }
1558 // TODO:
1559 // Update initialized size in the attribute and in the
1560 // inode (up to i_size).
1561 // Update iblock.
1562 // FIXME: This is inefficient. Try to batch the two
1563 // size changes to happen in one go.
1564 ntfs_error(vol->sb, "Writing beyond initialized size "
1565 "is not supported yet. Sorry.");
1566 err = -EOPNOTSUPP;
1567 goto err_out;
1568 // Do NOT set_buffer_new() BUT DO clear buffer range
1569 // outside write request range.
1570 // set_buffer_uptodate() on complete buffers as well as
1571 // set_buffer_dirty().
1572 }
1573
1574 /* Need to map unmapped buffers. */
1575 if (!buffer_mapped(bh)) {
1576 /* Unmapped buffer. Need to map it. */
1577 bh->b_bdev = vol->sb->s_bdev;
1578
1579 /* Convert block into corresponding vcn and offset. */
1580 vcn = (VCN)block << blocksize_bits >>
1581 vol->cluster_size_bits;
1582 vcn_ofs = ((VCN)block << blocksize_bits) &
1583 vol->cluster_size_mask;
1584
1585 is_retry = FALSE;
1586 if (!rl) {
1587lock_retry_remap:
1588 down_read(&ni->runlist.lock);
1589 rl = ni->runlist.rl;
1590 }
1591 if (likely(rl != NULL)) {
1592 /* Seek to element containing target vcn. */
1593 while (rl->length && rl[1].vcn <= vcn)
1594 rl++;
1595 lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
1596 } else
1597 lcn = LCN_RL_NOT_MAPPED;
1598 if (unlikely(lcn < 0)) {
1599 /*
1600 * We extended the attribute allocation above.
1601 * If we hit an ENOENT here it means that the
1602 * allocation was insufficient which is a bug.
1603 */
1604 BUG_ON(lcn == LCN_ENOENT);
1605
1606 /* It is a hole, need to instantiate it. */
1607 if (lcn == LCN_HOLE) {
1608 // TODO: Instantiate the hole.
1609 // clear_buffer_new(bh);
1610 // unmap_underlying_metadata(bh->b_bdev,
1611 // bh->b_blocknr);
1612 // For non-uptodate buffers, need to
1613 // zero out the region outside the
1614 // request in this bh or all bhs,
1615 // depending on what we implemented
1616 // above.
1617 // Need to flush_dcache_page().
1618 // Or could use set_buffer_new()
1619 // instead?
1620 ntfs_error(vol->sb, "Writing into "
1621 "sparse regions is "
1622 "not supported yet. "
1623 "Sorry.");
1624 err = -EOPNOTSUPP;
1625 goto err_out;
1626 } else if (!is_retry &&
1627 lcn == LCN_RL_NOT_MAPPED) {
1628 is_retry = TRUE;
1629 /*
1630 * Attempt to map runlist, dropping
1631 * lock for the duration.
1632 */
1633 up_read(&ni->runlist.lock);
1634 err = ntfs_map_runlist(ni, vcn);
1635 if (likely(!err))
1636 goto lock_retry_remap;
1637 rl = NULL;
1638 lcn = err;
1639 }
1640 /*
1641 * Failed to map the buffer, even after
1642 * retrying.
1643 */
1644 bh->b_blocknr = -1;
1645 ntfs_error(vol->sb, "Failed to write to inode "
1646 "0x%lx, attribute type 0x%x, "
1647 "vcn 0x%llx, offset 0x%x "
1648 "because its location on disk "
1649 "could not be determined%s "
1650 "(error code %lli).",
1651 ni->mft_no, ni->type,
1652 (unsigned long long)vcn,
1653 vcn_ofs, is_retry ? " even "
1654 "after retrying" : "",
1655 (long long)lcn);
1656 if (!err)
1657 err = -EIO;
1658 goto err_out;
1659 }
1660 /* We now have a successful remap, i.e. lcn >= 0. */
1661
1662 /* Setup buffer head to correct block. */
1663 bh->b_blocknr = ((lcn << vol->cluster_size_bits)
1664 + vcn_ofs) >> blocksize_bits;
1665 set_buffer_mapped(bh);
1666
1667 // FIXME: Something analogous to this is needed for
1668 // each newly allocated block, i.e. BH_New.
1669 // FIXME: Might need to take this out of the
1670 // if (!buffer_mapped(bh)) {}, depending on how we
1671 // implement things during the allocated_size and
1672 // initialized_size extension code above.
1673 if (buffer_new(bh)) {
1674 clear_buffer_new(bh);
1675 unmap_underlying_metadata(bh->b_bdev,
1676 bh->b_blocknr);
1677 if (PageUptodate(page)) {
1678 set_buffer_uptodate(bh);
1679 continue;
1680 }
1681 /*
1682 * Page is _not_ uptodate, zero surrounding
1683 * region. NOTE: This is how we decide if to
1684 * zero or not!
1685 */
1686 if (block_end > to || block_start < from) {
1687 void *kaddr;
1688
1689 kaddr = kmap_atomic(page, KM_USER0);
1690 if (block_end > to)
1691 memset(kaddr + to, 0,
1692 block_end - to);
1693 if (block_start < from)
1694 memset(kaddr + block_start, 0,
1695 from -
1696 block_start);
1697 flush_dcache_page(page);
1698 kunmap_atomic(kaddr, KM_USER0);
1699 }
1700 continue;
1701 }
1702 }
1703 /* @bh is mapped, set it uptodate if the page is uptodate. */
1704 if (PageUptodate(page)) {
1705 if (!buffer_uptodate(bh))
1706 set_buffer_uptodate(bh);
1707 continue;
1708 }
1709 /*
1710 * The page is not uptodate. The buffer is mapped. If it is not
1711 * uptodate, and it is only partially being written to, we need
1712 * to read the buffer in before the write, i.e. right now.
1713 */
1714 if (!buffer_uptodate(bh) &&
1715 (block_start < from || block_end > to)) {
1716 ll_rw_block(READ, 1, &bh);
1717 *wait_bh++ = bh;
1718 }
1719 } while (block++, block_start = block_end,
1720 (bh = bh->b_this_page) != head);
1721
1722 /* Release the lock if we took it. */
1723 if (rl) {
1724 up_read(&ni->runlist.lock);
1725 rl = NULL;
1726 }
1727
1728 /* If we issued read requests, let them complete. */
1729 while (wait_bh > wait) {
1730 wait_on_buffer(*--wait_bh);
1731 if (!buffer_uptodate(*wait_bh))
1732 return -EIO;
1733 }
1734
1735 ntfs_debug("Done.");
1736 return 0;
1737err_out:
1738 /*
1739 * Zero out any newly allocated blocks to avoid exposing stale data.
1740 * If BH_New is set, we know that the block was newly allocated in the
1741 * above loop.
1742 * FIXME: What about initialized_size increments? Have we done all the
1743 * required zeroing above? If not this error handling is broken, and
1744 * in particular the if (block_end <= from) check is completely bogus.
1745 */
1746 bh = head;
1747 block_start = 0;
1748 is_retry = FALSE;
1749 do {
1750 block_end = block_start + blocksize;
1751 if (block_end <= from)
1752 continue;
1753 if (block_start >= to)
1754 break;
1755 if (buffer_new(bh)) {
1756 void *kaddr;
1757
1758 clear_buffer_new(bh);
1759 kaddr = kmap_atomic(page, KM_USER0);
1760 memset(kaddr + block_start, 0, bh->b_size);
1761 kunmap_atomic(kaddr, KM_USER0);
1762 set_buffer_uptodate(bh);
1763 mark_buffer_dirty(bh);
1764 is_retry = TRUE;
1765 }
1766 } while (block_start = block_end, (bh = bh->b_this_page) != head);
1767 if (is_retry)
1768 flush_dcache_page(page);
1769 if (rl)
1770 up_read(&ni->runlist.lock);
1771 return err;
1772}
1773
1774/**
1775 * ntfs_prepare_write - prepare a page for receiving data
1776 *
1777 * This is called from generic_file_write() with i_sem held on the inode
1778 * (@page->mapping->host). The @page is locked but not kmap()ped. The source
1779 * data has not yet been copied into the @page.
1780 *
1781 * Need to extend the attribute/fill in holes if necessary, create blocks and
1782 * make partially overwritten blocks uptodate,
1783 *
1784 * i_size is not to be modified yet.
1785 *
1786 * Return 0 on success or -errno on error.
1787 *
1788 * Should be using block_prepare_write() [support for sparse files] or
1789 * cont_prepare_write() [no support for sparse files]. Cannot do that due to
1790 * ntfs specifics but can look at them for implementation guidance.
1791 *
1792 * Note: In the range, @from is inclusive and @to is exclusive, i.e. @from is
1793 * the first byte in the page that will be written to and @to is the first byte
1794 * after the last byte that will be written to.
1795 */
1796static int ntfs_prepare_write(struct file *file, struct page *page,
1797 unsigned from, unsigned to)
1798{
1799 s64 new_size;
1800 struct inode *vi = page->mapping->host;
1801 ntfs_inode *base_ni = NULL, *ni = NTFS_I(vi);
1802 ntfs_volume *vol = ni->vol;
1803 ntfs_attr_search_ctx *ctx = NULL;
1804 MFT_RECORD *m = NULL;
1805 ATTR_RECORD *a;
1806 u8 *kaddr;
1807 u32 attr_len;
1808 int err;
1809
1810 ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
1811 "0x%lx, from = %u, to = %u.", vi->i_ino, ni->type,
1812 page->index, from, to);
1813 BUG_ON(!PageLocked(page));
1814 BUG_ON(from > PAGE_CACHE_SIZE);
1815 BUG_ON(to > PAGE_CACHE_SIZE);
1816 BUG_ON(from > to);
1817 BUG_ON(NInoMstProtected(ni));
1818 /*
1819 * If a previous ntfs_truncate() failed, repeat it and abort if it
1820 * fails again.
1821 */
1822 if (unlikely(NInoTruncateFailed(ni))) {
1823 down_write(&vi->i_alloc_sem);
1824 err = ntfs_truncate(vi);
1825 up_write(&vi->i_alloc_sem);
1826 if (err || NInoTruncateFailed(ni)) {
1827 if (!err)
1828 err = -EIO;
1829 goto err_out;
1830 }
1831 }
1832 /* If the attribute is not resident, deal with it elsewhere. */
1833 if (NInoNonResident(ni)) {
1834 /*
1835 * Only unnamed $DATA attributes can be compressed, encrypted,
1836 * and/or sparse.
1837 */
1838 if (ni->type == AT_DATA && !ni->name_len) {
1839 /* If file is encrypted, deny access, just like NT4. */
1840 if (NInoEncrypted(ni)) {
1841 ntfs_debug("Denying write access to encrypted "
1842 "file.");
1843 return -EACCES;
1844 }
1845 /* Compressed data streams are handled in compress.c. */
1846 if (NInoCompressed(ni)) {
1847 // TODO: Implement and replace this check with
1848 // return ntfs_write_compressed_block(page);
1849 ntfs_error(vi->i_sb, "Writing to compressed "
1850 "files is not supported yet. "
1851 "Sorry.");
1852 return -EOPNOTSUPP;
1853 }
1854 // TODO: Implement and remove this check.
1855 if (NInoSparse(ni)) {
1856 ntfs_error(vi->i_sb, "Writing to sparse files "
1857 "is not supported yet. Sorry.");
1858 return -EOPNOTSUPP;
1859 }
1860 }
1861 /* Normal data stream. */
1862 return ntfs_prepare_nonresident_write(page, from, to);
1863 }
1864 /*
1865 * Attribute is resident, implying it is not compressed, encrypted, or
1866 * sparse.
1867 */
1868 BUG_ON(page_has_buffers(page));
1869 new_size = ((s64)page->index << PAGE_CACHE_SHIFT) + to;
1870 /* If we do not need to resize the attribute allocation we are done. */
1871 if (new_size <= vi->i_size)
1872 goto done;
1873
1874 // FIXME: We abort for now as this code is not safe.
1875 ntfs_error(vi->i_sb, "Changing the file size is not supported yet. "
1876 "Sorry.");
1877 return -EOPNOTSUPP;
1878
1879 /* Map, pin, and lock the (base) mft record. */
1880 if (!NInoAttr(ni))
1881 base_ni = ni;
1882 else
1883 base_ni = ni->ext.base_ntfs_ino;
1884 m = map_mft_record(base_ni);
1885 if (IS_ERR(m)) {
1886 err = PTR_ERR(m);
1887 m = NULL;
1888 ctx = NULL;
1889 goto err_out;
1890 }
1891 ctx = ntfs_attr_get_search_ctx(base_ni, m);
1892 if (unlikely(!ctx)) {
1893 err = -ENOMEM;
1894 goto err_out;
1895 }
1896 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
1897 CASE_SENSITIVE, 0, NULL, 0, ctx);
1898 if (unlikely(err)) {
1899 if (err == -ENOENT)
1900 err = -EIO;
1901 goto err_out;
1902 }
1903 m = ctx->mrec;
1904 a = ctx->attr;
1905 /* The total length of the attribute value. */
1906 attr_len = le32_to_cpu(a->data.resident.value_length);
1907 BUG_ON(vi->i_size != attr_len);
1908 /* Check if new size is allowed in $AttrDef. */
1909 err = ntfs_attr_size_bounds_check(vol, ni->type, new_size);
1910 if (unlikely(err)) {
1911 if (err == -ERANGE) {
1912 ntfs_error(vol->sb, "Write would cause the inode "
1913 "0x%lx to exceed the maximum size for "
1914 "its attribute type (0x%x). Aborting "
1915 "write.", vi->i_ino,
1916 le32_to_cpu(ni->type));
1917 } else {
1918 ntfs_error(vol->sb, "Inode 0x%lx has unknown "
1919 "attribute type 0x%x. Aborting "
1920 "write.", vi->i_ino,
1921 le32_to_cpu(ni->type));
1922 err = -EIO;
1923 }
1924 goto err_out2;
1925 }
1926 /*
1927 * Extend the attribute record to be able to store the new attribute
1928 * size.
1929 */
1930 if (new_size >= vol->mft_record_size || ntfs_attr_record_resize(m, a,
1931 le16_to_cpu(a->data.resident.value_offset) +
1932 new_size)) {
1933 /* Not enough space in the mft record. */
1934 ntfs_error(vol->sb, "Not enough space in the mft record for "
1935 "the resized attribute value. This is not "
1936 "supported yet. Aborting write.");
1937 err = -EOPNOTSUPP;
1938 goto err_out2;
1939 }
1940 /*
1941 * We have enough space in the mft record to fit the write. This
1942 * implies the attribute is smaller than the mft record and hence the
1943 * attribute must be in a single page and hence page->index must be 0.
1944 */
1945 BUG_ON(page->index);
1946 /*
1947 * If the beginning of the write is past the old size, enlarge the
1948 * attribute value up to the beginning of the write and fill it with
1949 * zeroes.
1950 */
1951 if (from > attr_len) {
1952 memset((u8*)a + le16_to_cpu(a->data.resident.value_offset) +
1953 attr_len, 0, from - attr_len);
1954 a->data.resident.value_length = cpu_to_le32(from);
1955 /* Zero the corresponding area in the page as well. */
1956 if (PageUptodate(page)) {
1957 kaddr = kmap_atomic(page, KM_USER0);
1958 memset(kaddr + attr_len, 0, from - attr_len);
1959 kunmap_atomic(kaddr, KM_USER0);
1960 flush_dcache_page(page);
1961 }
1962 }
1963 flush_dcache_mft_record_page(ctx->ntfs_ino);
1964 mark_mft_record_dirty(ctx->ntfs_ino);
1965 ntfs_attr_put_search_ctx(ctx);
1966 unmap_mft_record(base_ni);
1967 /*
1968 * Because resident attributes are handled by memcpy() to/from the
1969 * corresponding MFT record, and because this form of i/o is byte
1970 * aligned rather than block aligned, there is no need to bring the
1971 * page uptodate here as in the non-resident case where we need to
1972 * bring the buffers straddled by the write uptodate before
1973 * generic_file_write() does the copying from userspace.
1974 *
1975 * We thus defer the uptodate bringing of the page region outside the
1976 * region written to to ntfs_commit_write(), which makes the code
1977 * simpler and saves one atomic kmap which is good.
1978 */
1979done:
1980 ntfs_debug("Done.");
1981 return 0;
1982err_out:
1983 if (err == -ENOMEM)
1984 ntfs_warning(vi->i_sb, "Error allocating memory required to "
1985 "prepare the write.");
1986 else {
1987 ntfs_error(vi->i_sb, "Resident attribute prepare write failed "
1988 "with error %i.", err);
1989 NVolSetErrors(vol);
1990 make_bad_inode(vi);
1991 }
1992err_out2:
1993 if (ctx)
1994 ntfs_attr_put_search_ctx(ctx);
1995 if (m)
1996 unmap_mft_record(base_ni);
1997 return err;
1998}
1999
2000/**
2001 * ntfs_commit_nonresident_write -
2002 *
2003 */
2004static int ntfs_commit_nonresident_write(struct page *page,
2005 unsigned from, unsigned to)
2006{
2007 s64 pos = ((s64)page->index << PAGE_CACHE_SHIFT) + to;
2008 struct inode *vi = page->mapping->host;
2009 struct buffer_head *bh, *head;
2010 unsigned int block_start, block_end, blocksize;
2011 BOOL partial;
2012
2013 ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
2014 "0x%lx, from = %u, to = %u.", vi->i_ino,
2015 NTFS_I(vi)->type, page->index, from, to);
2016 blocksize = 1 << vi->i_blkbits;
2017
2018 // FIXME: We need a whole slew of special cases in here for compressed
2019 // files for example...
2020 // For now, we know ntfs_prepare_write() would have failed so we can't
2021 // get here in any of the cases which we have to special case, so we
2022 // are just a ripped off, unrolled generic_commit_write().
2023
2024 bh = head = page_buffers(page);
2025 block_start = 0;
2026 partial = FALSE;
2027 do {
2028 block_end = block_start + blocksize;
2029 if (block_end <= from || block_start >= to) {
2030 if (!buffer_uptodate(bh))
2031 partial = TRUE;
2032 } else {
2033 set_buffer_uptodate(bh);
2034 mark_buffer_dirty(bh);
2035 }
2036 } while (block_start = block_end, (bh = bh->b_this_page) != head);
2037 /*
2038 * If this is a partial write which happened to make all buffers
2039 * uptodate then we can optimize away a bogus ->readpage() for the next
2040 * read(). Here we 'discover' whether the page went uptodate as a
2041 * result of this (potentially partial) write.
2042 */
2043 if (!partial)
2044 SetPageUptodate(page);
2045 /*
2046 * Not convinced about this at all. See disparity comment above. For
2047 * now we know ntfs_prepare_write() would have failed in the write
2048 * exceeds i_size case, so this will never trigger which is fine.
2049 */
2050 if (pos > vi->i_size) {
2051 ntfs_error(vi->i_sb, "Writing beyond the existing file size is "
2052 "not supported yet. Sorry.");
2053 return -EOPNOTSUPP;
2054 // vi->i_size = pos;
2055 // mark_inode_dirty(vi);
2056 }
2057 ntfs_debug("Done.");
2058 return 0;
2059}
2060
2061/**
2062 * ntfs_commit_write - commit the received data
2063 *
2064 * This is called from generic_file_write() with i_sem held on the inode
2065 * (@page->mapping->host). The @page is locked but not kmap()ped. The source
2066 * data has already been copied into the @page. ntfs_prepare_write() has been
2067 * called before the data copied and it returned success so we can take the
2068 * results of various BUG checks and some error handling for granted.
2069 *
2070 * Need to mark modified blocks dirty so they get written out later when
2071 * ntfs_writepage() is invoked by the VM.
2072 *
2073 * Return 0 on success or -errno on error.
2074 *
2075 * Should be using generic_commit_write(). This marks buffers uptodate and
2076 * dirty, sets the page uptodate if all buffers in the page are uptodate, and
2077 * updates i_size if the end of io is beyond i_size. In that case, it also
2078 * marks the inode dirty.
2079 *
2080 * Cannot use generic_commit_write() due to ntfs specialities but can look at
2081 * it for implementation guidance.
2082 *
2083 * If things have gone as outlined in ntfs_prepare_write(), then we do not
2084 * need to do any page content modifications here at all, except in the write
2085 * to resident attribute case, where we need to do the uptodate bringing here
2086 * which we combine with the copying into the mft record which means we save
2087 * one atomic kmap.
2088 */
2089static int ntfs_commit_write(struct file *file, struct page *page,
2090 unsigned from, unsigned to)
2091{
2092 struct inode *vi = page->mapping->host;
2093 ntfs_inode *base_ni, *ni = NTFS_I(vi);
2094 char *kaddr, *kattr;
2095 ntfs_attr_search_ctx *ctx;
2096 MFT_RECORD *m;
2097 ATTR_RECORD *a;
2098 u32 attr_len;
2099 int err;
2100
2101 ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
2102 "0x%lx, from = %u, to = %u.", vi->i_ino, ni->type,
2103 page->index, from, to);
2104 /* If the attribute is not resident, deal with it elsewhere. */
2105 if (NInoNonResident(ni)) {
2106 /* Only unnamed $DATA attributes can be compressed/encrypted. */
2107 if (ni->type == AT_DATA && !ni->name_len) {
2108 /* Encrypted files need separate handling. */
2109 if (NInoEncrypted(ni)) {
2110 // We never get here at present!
2111 BUG();
2112 }
2113 /* Compressed data streams are handled in compress.c. */
2114 if (NInoCompressed(ni)) {
2115 // TODO: Implement this!
2116 // return ntfs_write_compressed_block(page);
2117 // We never get here at present!
2118 BUG();
2119 }
2120 }
2121 /* Normal data stream. */
2122 return ntfs_commit_nonresident_write(page, from, to);
2123 }
2124 /*
2125 * Attribute is resident, implying it is not compressed, encrypted, or
2126 * sparse.
2127 */
2128 if (!NInoAttr(ni))
2129 base_ni = ni;
2130 else
2131 base_ni = ni->ext.base_ntfs_ino;
2132 /* Map, pin, and lock the mft record. */
2133 m = map_mft_record(base_ni);
2134 if (IS_ERR(m)) {
2135 err = PTR_ERR(m);
2136 m = NULL;
2137 ctx = NULL;
2138 goto err_out;
2139 }
2140 ctx = ntfs_attr_get_search_ctx(base_ni, m);
2141 if (unlikely(!ctx)) {
2142 err = -ENOMEM;
2143 goto err_out;
2144 }
2145 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
2146 CASE_SENSITIVE, 0, NULL, 0, ctx);
2147 if (unlikely(err)) {
2148 if (err == -ENOENT)
2149 err = -EIO;
2150 goto err_out;
2151 }
2152 a = ctx->attr;
2153 /* The total length of the attribute value. */
2154 attr_len = le32_to_cpu(a->data.resident.value_length);
2155 BUG_ON(from > attr_len);
2156 kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset);
2157 kaddr = kmap_atomic(page, KM_USER0);
2158 /* Copy the received data from the page to the mft record. */
2159 memcpy(kattr + from, kaddr + from, to - from);
2160 /* Update the attribute length if necessary. */
2161 if (to > attr_len) {
2162 attr_len = to;
2163 a->data.resident.value_length = cpu_to_le32(attr_len);
2164 }
2165 /*
2166 * If the page is not uptodate, bring the out of bounds area(s)
2167 * uptodate by copying data from the mft record to the page.
2168 */
2169 if (!PageUptodate(page)) {
2170 if (from > 0)
2171 memcpy(kaddr, kattr, from);
2172 if (to < attr_len)
2173 memcpy(kaddr + to, kattr + to, attr_len - to);
2174 /* Zero the region outside the end of the attribute value. */
2175 if (attr_len < PAGE_CACHE_SIZE)
2176 memset(kaddr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
2177 /*
2178 * The probability of not having done any of the above is
2179 * extremely small, so we just flush unconditionally.
2180 */
2181 flush_dcache_page(page);
2182 SetPageUptodate(page);
2183 }
2184 kunmap_atomic(kaddr, KM_USER0);
2185 /* Update i_size if necessary. */
2186 if (vi->i_size < attr_len) {
2187 ni->allocated_size = ni->initialized_size = attr_len;
2188 i_size_write(vi, attr_len);
2189 }
2190 /* Mark the mft record dirty, so it gets written back. */
2191 flush_dcache_mft_record_page(ctx->ntfs_ino);
2192 mark_mft_record_dirty(ctx->ntfs_ino);
2193 ntfs_attr_put_search_ctx(ctx);
2194 unmap_mft_record(base_ni);
2195 ntfs_debug("Done.");
2196 return 0;
2197err_out:
2198 if (err == -ENOMEM) {
2199 ntfs_warning(vi->i_sb, "Error allocating memory required to "
2200 "commit the write.");
2201 if (PageUptodate(page)) {
2202 ntfs_warning(vi->i_sb, "Page is uptodate, setting "
2203 "dirty so the write will be retried "
2204 "later on by the VM.");
2205 /*
2206 * Put the page on mapping->dirty_pages, but leave its
2207 * buffers' dirty state as-is.
2208 */
2209 __set_page_dirty_nobuffers(page);
2210 err = 0;
2211 } else
2212 ntfs_error(vi->i_sb, "Page is not uptodate. Written "
2213 "data has been lost.");
2214 } else {
2215 ntfs_error(vi->i_sb, "Resident attribute commit write failed "
2216 "with error %i.", err);
2217 NVolSetErrors(ni->vol);
2218 make_bad_inode(vi);
2219 }
2220 if (ctx)
2221 ntfs_attr_put_search_ctx(ctx);
2222 if (m)
2223 unmap_mft_record(base_ni);
2224 return err;
2225}
2226
2227#endif /* NTFS_RW */
2228
2229/**
2230 * ntfs_aops - general address space operations for inodes and attributes
2231 */
2232struct address_space_operations ntfs_aops = {
2233 .readpage = ntfs_readpage, /* Fill page with data. */
2234 .sync_page = block_sync_page, /* Currently, just unplugs the
2235 disk request queue. */
2236#ifdef NTFS_RW
2237 .writepage = ntfs_writepage, /* Write dirty page to disk. */
2238 .prepare_write = ntfs_prepare_write, /* Prepare page and buffers
2239 ready to receive data. */
2240 .commit_write = ntfs_commit_write, /* Commit received data. */
2241#endif /* NTFS_RW */
2242};
2243
2244/**
2245 * ntfs_mst_aops - general address space operations for mst protecteed inodes
2246 * and attributes
2247 */
2248struct address_space_operations ntfs_mst_aops = {
2249 .readpage = ntfs_readpage, /* Fill page with data. */
2250 .sync_page = block_sync_page, /* Currently, just unplugs the
2251 disk request queue. */
2252#ifdef NTFS_RW
2253 .writepage = ntfs_writepage, /* Write dirty page to disk. */
2254 .set_page_dirty = __set_page_dirty_nobuffers, /* Set the page dirty
2255 without touching the buffers
2256 belonging to the page. */
2257#endif /* NTFS_RW */
2258};
2259
2260#ifdef NTFS_RW
2261
2262/**
2263 * mark_ntfs_record_dirty - mark an ntfs record dirty
2264 * @page: page containing the ntfs record to mark dirty
2265 * @ofs: byte offset within @page at which the ntfs record begins
2266 *
2267 * Set the buffers and the page in which the ntfs record is located dirty.
2268 *
2269 * The latter also marks the vfs inode the ntfs record belongs to dirty
2270 * (I_DIRTY_PAGES only).
2271 *
2272 * If the page does not have buffers, we create them and set them uptodate.
2273 * The page may not be locked which is why we need to handle the buffers under
2274 * the mapping->private_lock. Once the buffers are marked dirty we no longer
2275 * need the lock since try_to_free_buffers() does not free dirty buffers.
2276 */
2277void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) {
2278 struct address_space *mapping = page->mapping;
2279 ntfs_inode *ni = NTFS_I(mapping->host);
2280 struct buffer_head *bh, *head, *buffers_to_free = NULL;
2281 unsigned int end, bh_size, bh_ofs;
2282
2283 BUG_ON(!PageUptodate(page));
2284 end = ofs + ni->itype.index.block_size;
2285 bh_size = 1 << VFS_I(ni)->i_blkbits;
2286 spin_lock(&mapping->private_lock);
2287 if (unlikely(!page_has_buffers(page))) {
2288 spin_unlock(&mapping->private_lock);
2289 bh = head = alloc_page_buffers(page, bh_size, 1);
2290 spin_lock(&mapping->private_lock);
2291 if (likely(!page_has_buffers(page))) {
2292 struct buffer_head *tail;
2293
2294 do {
2295 set_buffer_uptodate(bh);
2296 tail = bh;
2297 bh = bh->b_this_page;
2298 } while (bh);
2299 tail->b_this_page = head;
2300 attach_page_buffers(page, head);
2301 } else
2302 buffers_to_free = bh;
2303 }
2304 bh = head = page_buffers(page);
2305 do {
2306 bh_ofs = bh_offset(bh);
2307 if (bh_ofs + bh_size <= ofs)
2308 continue;
2309 if (unlikely(bh_ofs >= end))
2310 break;
2311 set_buffer_dirty(bh);
2312 } while ((bh = bh->b_this_page) != head);
2313 spin_unlock(&mapping->private_lock);
2314 __set_page_dirty_nobuffers(page);
2315 if (unlikely(buffers_to_free)) {
2316 do {
2317 bh = buffers_to_free->b_this_page;
2318 free_buffer_head(buffers_to_free);
2319 buffers_to_free = bh;
2320 } while (buffers_to_free);
2321 }
2322}
2323
2324#endif /* NTFS_RW */
diff --git a/fs/ntfs/aops.h b/fs/ntfs/aops.h
new file mode 100644
index 000000000000..3b74e66ca2ff
--- /dev/null
+++ b/fs/ntfs/aops.h
@@ -0,0 +1,109 @@
1/**
2 * aops.h - Defines for NTFS kernel address space operations and page cache
3 * handling. Part of the Linux-NTFS project.
4 *
5 * Copyright (c) 2001-2004 Anton Altaparmakov
6 * Copyright (c) 2002 Richard Russon
7 *
8 * This program/include file is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as published
10 * by the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program/include file is distributed in the hope that it will be
14 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
15 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program (in the main directory of the Linux-NTFS
20 * distribution in the file COPYING); if not, write to the Free Software
21 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
23
24#ifndef _LINUX_NTFS_AOPS_H
25#define _LINUX_NTFS_AOPS_H
26
27#include <linux/mm.h>
28#include <linux/highmem.h>
29#include <linux/pagemap.h>
30#include <linux/fs.h>
31
32#include "inode.h"
33
/**
 * ntfs_unmap_page - release a page that was mapped using ntfs_map_page()
 * @page:	the page to release
 *
 * Undo ntfs_map_page(): kunmap() @page and then drop the page cache
 * reference to it, i.e. unpin, unmap and release the page.
 */
static inline void ntfs_unmap_page(struct page *page)
{
	kunmap(page);
	page_cache_release(page);
}
45
46/**
47 * ntfs_map_page - map a page into accessible memory, reading it if necessary
48 * @mapping: address space for which to obtain the page
49 * @index: index into the page cache for @mapping of the page to map
50 *
51 * Read a page from the page cache of the address space @mapping at position
52 * @index, where @index is in units of PAGE_CACHE_SIZE, and not in bytes.
53 *
54 * If the page is not in memory it is loaded from disk first using the readpage
55 * method defined in the address space operations of @mapping and the page is
56 * added to the page cache of @mapping in the process.
57 *
58 * If the page belongs to an mst protected attribute and it is marked as such
59 * in its ntfs inode (NInoMstProtected()) the mst fixups are applied but no
60 * error checking is performed. This means the caller has to verify whether
61 * the ntfs record(s) contained in the page are valid or not using one of the
62 * ntfs_is_XXXX_record{,p}() macros, where XXXX is the record type you are
63 * expecting to see. (For details of the macros, see fs/ntfs/layout.h.)
64 *
65 * If the page is in high memory it is mapped into memory directly addressible
66 * by the kernel.
67 *
68 * Finally the page count is incremented, thus pinning the page into place.
69 *
70 * The above means that page_address(page) can be used on all pages obtained
71 * with ntfs_map_page() to get the kernel virtual address of the page.
72 *
73 * When finished with the page, the caller has to call ntfs_unmap_page() to
74 * unpin, unmap and release the page.
75 *
76 * Note this does not grant exclusive access. If such is desired, the caller
77 * must provide it independently of the ntfs_{un}map_page() calls by using
78 * a {rw_}semaphore or other means of serialization. A spin lock cannot be
79 * used as ntfs_map_page() can block.
80 *
81 * The unlocked and uptodate page is returned on success or an encoded error
82 * on failure. Caller has to test for error using the IS_ERR() macro on the
83 * return value. If that evaluates to TRUE, the negative error code can be
84 * obtained using PTR_ERR() on the return value of ntfs_map_page().
85 */
86static inline struct page *ntfs_map_page(struct address_space *mapping,
87 unsigned long index)
88{
89 struct page *page = read_cache_page(mapping, index,
90 (filler_t*)mapping->a_ops->readpage, NULL);
91
92 if (!IS_ERR(page)) {
93 wait_on_page_locked(page);
94 kmap(page);
95 if (PageUptodate(page) && !PageError(page))
96 return page;
97 ntfs_unmap_page(page);
98 return ERR_PTR(-EIO);
99 }
100 return page;
101}
102
103#ifdef NTFS_RW
104
105extern void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs);
106
107#endif /* NTFS_RW */
108
109#endif /* _LINUX_NTFS_AOPS_H */
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
new file mode 100644
index 000000000000..1ff7f90a18b0
--- /dev/null
+++ b/fs/ntfs/attrib.c
@@ -0,0 +1,1258 @@
1/**
2 * attrib.c - NTFS attribute operations. Part of the Linux-NTFS project.
3 *
4 * Copyright (c) 2001-2004 Anton Altaparmakov
5 * Copyright (c) 2002 Richard Russon
6 *
7 * This program/include file is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as published
9 * by the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program/include file is distributed in the hope that it will be
13 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program (in the main directory of the Linux-NTFS
19 * distribution in the file COPYING); if not, write to the Free Software
20 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 */
22
23#include <linux/buffer_head.h>
24
25#include "attrib.h"
26#include "debug.h"
27#include "layout.h"
28#include "mft.h"
29#include "ntfs.h"
30#include "types.h"
31
32/**
33 * ntfs_map_runlist - map (a part of) a runlist of an ntfs inode
34 * @ni: ntfs inode for which to map (part of) a runlist
35 * @vcn: map runlist part containing this vcn
36 *
37 * Map the part of a runlist containing the @vcn of the ntfs inode @ni.
38 *
39 * Return 0 on success and -errno on error.
40 *
41 * Locking: - The runlist must be unlocked on entry and is unlocked on return.
42 * - This function takes the lock for writing and modifies the runlist.
43 */
44int ntfs_map_runlist(ntfs_inode *ni, VCN vcn)
45{
46 ntfs_inode *base_ni;
47 ntfs_attr_search_ctx *ctx;
48 MFT_RECORD *mrec;
49 int err = 0;
50
51 ntfs_debug("Mapping runlist part containing vcn 0x%llx.",
52 (unsigned long long)vcn);
53
54 if (!NInoAttr(ni))
55 base_ni = ni;
56 else
57 base_ni = ni->ext.base_ntfs_ino;
58
59 mrec = map_mft_record(base_ni);
60 if (IS_ERR(mrec))
61 return PTR_ERR(mrec);
62 ctx = ntfs_attr_get_search_ctx(base_ni, mrec);
63 if (unlikely(!ctx)) {
64 err = -ENOMEM;
65 goto err_out;
66 }
67 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
68 CASE_SENSITIVE, vcn, NULL, 0, ctx);
69 if (unlikely(err))
70 goto put_err_out;
71
72 down_write(&ni->runlist.lock);
73 /* Make sure someone else didn't do the work while we were sleeping. */
74 if (likely(ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn) <=
75 LCN_RL_NOT_MAPPED)) {
76 runlist_element *rl;
77
78 rl = ntfs_mapping_pairs_decompress(ni->vol, ctx->attr,
79 ni->runlist.rl);
80 if (IS_ERR(rl))
81 err = PTR_ERR(rl);
82 else
83 ni->runlist.rl = rl;
84 }
85 up_write(&ni->runlist.lock);
86
87put_err_out:
88 ntfs_attr_put_search_ctx(ctx);
89err_out:
90 unmap_mft_record(base_ni);
91 return err;
92}
93
94/**
95 * ntfs_find_vcn - find a vcn in the runlist described by an ntfs inode
96 * @ni: ntfs inode describing the runlist to search
97 * @vcn: vcn to find
98 * @need_write: if false, lock for reading and if true, lock for writing
99 *
100 * Find the virtual cluster number @vcn in the runlist described by the ntfs
101 * inode @ni and return the address of the runlist element containing the @vcn.
102 * The runlist is left locked and the caller has to unlock it. If @need_write
103 * is true, the runlist is locked for writing and if @need_write is false, the
104 * runlist is locked for reading. In the error case, the runlist is not left
105 * locked.
106 *
107 * Note you need to distinguish between the lcn of the returned runlist element
108 * being >= 0 and LCN_HOLE. In the later case you have to return zeroes on
109 * read and allocate clusters on write.
110 *
111 * Return the runlist element containing the @vcn on success and
112 * ERR_PTR(-errno) on error. You need to test the return value with IS_ERR()
113 * to decide if the return is success or failure and PTR_ERR() to get to the
114 * error code if IS_ERR() is true.
115 *
116 * The possible error return codes are:
117 * -ENOENT - No such vcn in the runlist, i.e. @vcn is out of bounds.
118 * -ENOMEM - Not enough memory to map runlist.
119 * -EIO - Critical error (runlist/file is corrupt, i/o error, etc).
120 *
121 * Locking: - The runlist must be unlocked on entry.
122 * - On failing return, the runlist is unlocked.
123 * - On successful return, the runlist is locked. If @need_write us
124 * true, it is locked for writing. Otherwise is is locked for
125 * reading.
126 */
127runlist_element *ntfs_find_vcn(ntfs_inode *ni, const VCN vcn,
128 const BOOL need_write)
129{
130 runlist_element *rl;
131 int err = 0;
132 BOOL is_retry = FALSE;
133
134 ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, lock for %sing.",
135 ni->mft_no, (unsigned long long)vcn,
136 !need_write ? "read" : "writ");
137 BUG_ON(!ni);
138 BUG_ON(!NInoNonResident(ni));
139 BUG_ON(vcn < 0);
140lock_retry_remap:
141 if (!need_write)
142 down_read(&ni->runlist.lock);
143 else
144 down_write(&ni->runlist.lock);
145 rl = ni->runlist.rl;
146 if (likely(rl && vcn >= rl[0].vcn)) {
147 while (likely(rl->length)) {
148 if (likely(vcn < rl[1].vcn)) {
149 if (likely(rl->lcn >= LCN_HOLE)) {
150 ntfs_debug("Done.");
151 return rl;
152 }
153 break;
154 }
155 rl++;
156 }
157 if (likely(rl->lcn != LCN_RL_NOT_MAPPED)) {
158 if (likely(rl->lcn == LCN_ENOENT))
159 err = -ENOENT;
160 else
161 err = -EIO;
162 }
163 }
164 if (!need_write)
165 up_read(&ni->runlist.lock);
166 else
167 up_write(&ni->runlist.lock);
168 if (!err && !is_retry) {
169 /*
170 * The @vcn is in an unmapped region, map the runlist and
171 * retry.
172 */
173 err = ntfs_map_runlist(ni, vcn);
174 if (likely(!err)) {
175 is_retry = TRUE;
176 goto lock_retry_remap;
177 }
178 /*
179 * -EINVAL and -ENOENT coming from a failed mapping attempt are
180 * equivalent to i/o errors for us as they should not happen in
181 * our code paths.
182 */
183 if (err == -EINVAL || err == -ENOENT)
184 err = -EIO;
185 } else if (!err)
186 err = -EIO;
187 ntfs_error(ni->vol->sb, "Failed with error code %i.", err);
188 return ERR_PTR(err);
189}
190
191/**
192 * ntfs_attr_find - find (next) attribute in mft record
193 * @type: attribute type to find
194 * @name: attribute name to find (optional, i.e. NULL means don't care)
195 * @name_len: attribute name length (only needed if @name present)
196 * @ic: IGNORE_CASE or CASE_SENSITIVE (ignored if @name not present)
197 * @val: attribute value to find (optional, resident attributes only)
198 * @val_len: attribute value length
199 * @ctx: search context with mft record and attribute to search from
200 *
201 * You should not need to call this function directly. Use ntfs_attr_lookup()
202 * instead.
203 *
204 * ntfs_attr_find() takes a search context @ctx as parameter and searches the
205 * mft record specified by @ctx->mrec, beginning at @ctx->attr, for an
206 * attribute of @type, optionally @name and @val.
207 *
208 * If the attribute is found, ntfs_attr_find() returns 0 and @ctx->attr will
209 * point to the found attribute.
210 *
211 * If the attribute is not found, ntfs_attr_find() returns -ENOENT and
212 * @ctx->attr will point to the attribute before which the attribute being
213 * searched for would need to be inserted if such an action were to be desired.
214 *
215 * On actual error, ntfs_attr_find() returns -EIO. In this case @ctx->attr is
216 * undefined and in particular do not rely on it not changing.
217 *
218 * If @ctx->is_first is TRUE, the search begins with @ctx->attr itself. If it
219 * is FALSE, the search begins after @ctx->attr.
220 *
221 * If @ic is IGNORE_CASE, the @name comparison is not case sensitive and
222 * @ctx->ntfs_ino must be set to the ntfs inode to which the mft record
223 * @ctx->mrec belongs. This is so we can get at the ntfs volume and hence at
224 * the upcase table. If @ic is CASE_SENSITIVE, the comparison is case
225 * sensitive. When @name is present, @name_len is the @name length in Unicode
226 * characters.
227 *
228 * If @name is not present (NULL), we assume that the unnamed attribute is
229 * being searched for.
230 *
231 * Finally, the resident attribute value @val is looked for, if present. If
232 * @val is not present (NULL), @val_len is ignored.
233 *
234 * ntfs_attr_find() only searches the specified mft record and it ignores the
235 * presence of an attribute list attribute (unless it is the one being searched
236 * for, obviously). If you need to take attribute lists into consideration,
237 * use ntfs_attr_lookup() instead (see below). This also means that you cannot
238 * use ntfs_attr_find() to search for extent records of non-resident
239 * attributes, as extents with lowest_vcn != 0 are usually described by the
240 * attribute list attribute only. - Note that it is possible that the first
241 * extent is only in the attribute list while the last extent is in the base
242 * mft record, so do not rely on being able to find the first extent in the
243 * base mft record.
244 *
245 * Warning: Never use @val when looking for attribute types which can be
246 * non-resident as this most likely will result in a crash!
247 */
248static int ntfs_attr_find(const ATTR_TYPE type, const ntfschar *name,
249 const u32 name_len, const IGNORE_CASE_BOOL ic,
250 const u8 *val, const u32 val_len, ntfs_attr_search_ctx *ctx)
251{
252 ATTR_RECORD *a;
253 ntfs_volume *vol = ctx->ntfs_ino->vol;
254 ntfschar *upcase = vol->upcase;
255 u32 upcase_len = vol->upcase_len;
256
257 /*
258 * Iterate over attributes in mft record starting at @ctx->attr, or the
259 * attribute following that, if @ctx->is_first is FALSE.
260 */
261 if (ctx->is_first) {
262 a = ctx->attr;
263 ctx->is_first = FALSE;
264 } else
265 a = (ATTR_RECORD*)((u8*)ctx->attr +
266 le32_to_cpu(ctx->attr->length));
267 for (;; a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length))) {
		/*
		 * Stop once @a points before the mft record or beyond its
		 * bytes_allocated.  Leaving the loop reports corruption.
		 */
268 if ((u8*)a < (u8*)ctx->mrec || (u8*)a > (u8*)ctx->mrec +
269 le32_to_cpu(ctx->mrec->bytes_allocated))
270 break;
271 ctx->attr = a;
		/*
		 * An attribute of a larger type than the one wanted, or the
		 * end marker, is treated as "not found".  @ctx->attr was just
		 * set to it.
		 */
272 if (unlikely(le32_to_cpu(a->type) > le32_to_cpu(type) ||
273 a->type == AT_END))
274 return -ENOENT;
		/*
		 * A zero length would never advance the loop; leave it and
		 * report corruption.
		 */
275 if (unlikely(!a->length))
276 break;
277 if (a->type != type)
278 continue;
279 /*
280 * If @name is present, compare the two names. If @name is
281 * missing, assume we want an unnamed attribute.
282 */
283 if (!name) {
284 /* The search failed if the found attribute is named. */
285 if (a->name_length)
286 return -ENOENT;
287 } else if (!ntfs_are_names_equal(name, name_len,
288 (ntfschar*)((u8*)a + le16_to_cpu(a->name_offset)),
289 a->name_length, ic, upcase, upcase_len)) {
290 register int rc;
291
			/* Names differ: first collate ignoring case. */
292 rc = ntfs_collate_names(name, name_len,
293 (ntfschar*)((u8*)a +
294 le16_to_cpu(a->name_offset)),
295 a->name_length, 1, IGNORE_CASE,
296 upcase, upcase_len);
297 /*
298 * If @name collates before a->name, there is no
299 * matching attribute.
300 */
301 if (rc == -1)
302 return -ENOENT;
303 /* If the strings are not equal, continue search. */
304 if (rc)
305 continue;
			/* Equal when ignoring case: repeat case sensitively. */
306 rc = ntfs_collate_names(name, name_len,
307 (ntfschar*)((u8*)a +
308 le16_to_cpu(a->name_offset)),
309 a->name_length, 1, CASE_SENSITIVE,
310 upcase, upcase_len);
311 if (rc == -1)
312 return -ENOENT;
313 if (rc)
314 continue;
315 }
316 /*
317 * The names match or @name not present and attribute is
318 * unnamed. If no @val specified, we have found the attribute
319 * and are done.
320 */
321 if (!val)
322 return 0;
323 /* @val is present; compare values. */
324 else {
325 register int rc;
326
			/* Compare only the bytes both values have in common. */
327 rc = memcmp(val, (u8*)a + le16_to_cpu(
328 a->data.resident.value_offset),
329 min_t(u32, val_len, le32_to_cpu(
330 a->data.resident.value_length)));
331 /*
332 * If the common bytes are equal, the lengths decide: equal
333 * lengths is a match, while a shorter @val collates before
			 * the current attribute's value so there is no matching
			 * attribute.  If @val compares lower, there is no matching
			 * attribute either.  In all other cases the search
			 * continues with the next attribute.
334 */
335 if (!rc) {
336 register u32 avl;
337
338 avl = le32_to_cpu(
339 a->data.resident.value_length);
340 if (val_len == avl)
341 return 0;
342 if (val_len < avl)
343 return -ENOENT;
344 } else if (rc < 0)
345 return -ENOENT;
346 }
347 }
	/* Only reached via break, i.e. the record failed a sanity check. */
348 ntfs_error(vol->sb, "Inode is corrupt. Run chkdsk.");
349 NVolSetErrors(vol);
350 return -EIO;
351}
352
353/**
354 * load_attribute_list - load an attribute list into memory
355 * @vol: ntfs volume from which to read
356 * @runlist: runlist of the attribute list
357 * @al_start: destination buffer
358 * @size: size of the destination buffer in bytes
359 * @initialized_size: initialized size of the attribute list
360 *
361 * Walk the runlist @runlist and load all clusters from it copying them into
362 * the linear buffer @al. The maximum number of bytes copied to @al is @size
363 * bytes. Note, @size does not need to be a multiple of the cluster size. If
364 * @initialized_size is less than @size, the region in @al between
365 * @initialized_size and @size will be zeroed and not read from disk.
366 *
367 * Return 0 on success or -errno on error.
368 */
369int load_attribute_list(ntfs_volume *vol, runlist *runlist, u8 *al_start,
370 const s64 size, const s64 initialized_size)
371{
372 LCN lcn;
373 u8 *al = al_start;
374 u8 *al_end = al + initialized_size;
375 runlist_element *rl;
376 struct buffer_head *bh;
377 struct super_block *sb;
378 unsigned long block_size;
379 unsigned long block, max_block;
380 int err = 0;
381 unsigned char block_size_bits;
382
383 ntfs_debug("Entering.");
384 if (!vol || !runlist || !al || size <= 0 || initialized_size < 0 ||
385 initialized_size > size)
386 return -EINVAL;
387 if (!initialized_size) {
388 memset(al, 0, size);
389 return 0;
390 }
391 sb = vol->sb;
392 block_size = sb->s_blocksize;
393 block_size_bits = sb->s_blocksize_bits;
394 down_read(&runlist->lock);
395 rl = runlist->rl;
396 /* Read all clusters specified by the runlist one run at a time. */
397 while (rl->length) {
398 lcn = ntfs_rl_vcn_to_lcn(rl, rl->vcn);
399 ntfs_debug("Reading vcn = 0x%llx, lcn = 0x%llx.",
400 (unsigned long long)rl->vcn,
401 (unsigned long long)lcn);
402 /* The attribute list cannot be sparse. */
403 if (lcn < 0) {
404 ntfs_error(sb, "ntfs_rl_vcn_to_lcn() failed. Cannot "
405 "read attribute list.");
406 goto err_out;
407 }
408 block = lcn << vol->cluster_size_bits >> block_size_bits;
409 /* Read the run from device in chunks of block_size bytes. */
410 max_block = block + (rl->length << vol->cluster_size_bits >>
411 block_size_bits);
412 ntfs_debug("max_block = 0x%lx.", max_block);
413 do {
414 ntfs_debug("Reading block = 0x%lx.", block);
415 bh = sb_bread(sb, block);
416 if (!bh) {
417 ntfs_error(sb, "sb_bread() failed. Cannot "
418 "read attribute list.");
419 goto err_out;
420 }
421 if (al + block_size >= al_end)
422 goto do_final;
423 memcpy(al, bh->b_data, block_size);
424 brelse(bh);
425 al += block_size;
426 } while (++block < max_block);
427 rl++;
428 }
429 if (initialized_size < size) {
430initialize:
431 memset(al_start + initialized_size, 0, size - initialized_size);
432 }
433done:
434 up_read(&runlist->lock);
435 return err;
436do_final:
437 if (al < al_end) {
438 /*
439 * Partial block.
440 *
441 * Note: The attribute list can be smaller than its allocation
442 * by multiple clusters. This has been encountered by at least
443 * two people running Windows XP, thus we cannot do any
444 * truncation sanity checking here. (AIA)
445 */
446 memcpy(al, bh->b_data, al_end - al);
447 brelse(bh);
448 if (initialized_size < size)
449 goto initialize;
450 goto done;
451 }
452 brelse(bh);
453 /* Real overflow! */
454 ntfs_error(sb, "Attribute list buffer overflow. Read attribute list "
455 "is truncated.");
456err_out:
457 err = -EIO;
458 goto done;
459}
460
461/**
462 * ntfs_external_attr_find - find an attribute in the attribute list of an inode
463 * @type: attribute type to find
464 * @name: attribute name to find (optional, i.e. NULL means don't care)
465 * @name_len: attribute name length (only needed if @name present)
466 * @ic: IGNORE_CASE or CASE_SENSITIVE (ignored if @name not present)
467 * @lowest_vcn: lowest vcn to find (optional, non-resident attributes only)
468 * @val: attribute value to find (optional, resident attributes only)
469 * @val_len: attribute value length
470 * @ctx: search context with mft record and attribute to search from
471 *
472 * You should not need to call this function directly. Use ntfs_attr_lookup()
473 * instead.
474 *
475 * Find an attribute by searching the attribute list for the corresponding
476 * attribute list entry. Having found the entry, map the mft record if the
477 * attribute is in a different mft record/inode, ntfs_attr_find() the attribute
478 * in there and return it.
479 *
480 * On first search @ctx->ntfs_ino must be the base mft record and @ctx must
481 * have been obtained from a call to ntfs_attr_get_search_ctx(). On subsequent
482 * calls @ctx->ntfs_ino can be any extent inode, too (@ctx->base_ntfs_ino is
483 * then the base inode).
484 *
485 * After finishing with the attribute/mft record you need to call
486 * ntfs_attr_put_search_ctx() to cleanup the search context (unmapping any
487 * mapped inodes, etc).
488 *
489 * If the attribute is found, ntfs_external_attr_find() returns 0 and
490 * @ctx->attr will point to the found attribute. @ctx->mrec will point to the
491 * mft record in which @ctx->attr is located and @ctx->al_entry will point to
492 * the attribute list entry for the attribute.
493 *
494 * If the attribute is not found, ntfs_external_attr_find() returns -ENOENT and
495 * @ctx->attr will point to the attribute in the base mft record before which
496 * the attribute being searched for would need to be inserted if such an action
497 * were to be desired. @ctx->mrec will point to the mft record in which
498 * @ctx->attr is located and @ctx->al_entry will point to the attribute list
499 * entry of the attribute before which the attribute being searched for would
500 * need to be inserted if such an action were to be desired.
501 *
502 * Thus to insert the not found attribute, one wants to add the attribute to
503 * @ctx->mrec (the base mft record) and if there is not enough space, the
504 * attribute should be placed in a newly allocated extent mft record. The
505 * attribute list entry for the inserted attribute should be inserted in the
506 * attribute list attribute at @ctx->al_entry.
507 *
508 * On actual error, ntfs_external_attr_find() returns -EIO. In this case
509 * @ctx->attr is undefined and in particular do not rely on it not changing.
510 */
511static int ntfs_external_attr_find(const ATTR_TYPE type,
512 const ntfschar *name, const u32 name_len,
513 const IGNORE_CASE_BOOL ic, const VCN lowest_vcn,
514 const u8 *val, const u32 val_len, ntfs_attr_search_ctx *ctx)
515{
516 ntfs_inode *base_ni, *ni;
517 ntfs_volume *vol;
518 ATTR_LIST_ENTRY *al_entry, *next_al_entry;
519 u8 *al_start, *al_end;
520 ATTR_RECORD *a;
521 ntfschar *al_name;
522 u32 al_name_len;
523 int err = 0;
524 static const char *es = " Unmount and run chkdsk.";
525
526 ni = ctx->ntfs_ino;
527 base_ni = ctx->base_ntfs_ino;
528 ntfs_debug("Entering for inode 0x%lx, type 0x%x.", ni->mft_no, type);
529 if (!base_ni) {
530 /* First call happens with the base mft record. */
531 base_ni = ctx->base_ntfs_ino = ctx->ntfs_ino;
532 ctx->base_mrec = ctx->mrec;
533 }
	/* While in the base record, remember the position reached in it. */
534 if (ni == base_ni)
535 ctx->base_attr = ctx->attr;
	/* AT_END is handled entirely at the not_found label. */
536 if (type == AT_END)
537 goto not_found;
538 vol = base_ni->vol;
539 al_start = base_ni->attr_list;
540 al_end = al_start + base_ni->attr_list_size;
541 if (!ctx->al_entry)
542 ctx->al_entry = (ATTR_LIST_ENTRY*)al_start;
543 /*
544 * Iterate over entries in attribute list starting at @ctx->al_entry,
545 * or the entry following that, if @ctx->is_first is FALSE.
546 */
547 if (ctx->is_first) {
548 al_entry = ctx->al_entry;
549 ctx->is_first = FALSE;
550 } else
551 al_entry = (ATTR_LIST_ENTRY*)((u8*)ctx->al_entry +
552 le16_to_cpu(ctx->al_entry->length));
553 for (;; al_entry = next_al_entry) {
554 /* Out of bounds check. */
555 if ((u8*)al_entry < base_ni->attr_list ||
556 (u8*)al_entry > al_end)
557 break; /* Inode is corrupt. */
558 ctx->al_entry = al_entry;
559 /* Catch the end of the attribute list. */
560 if ((u8*)al_entry == al_end)
561 goto not_found;
		/* A zero length entry would never advance; treat as corrupt. */
562 if (!al_entry->length)
563 break;
		/*
		 * The entry must fit inside the attribute list.
		 * NOTE(review): the 6 presumably covers the type and length
		 * fields at the start of an entry - confirm against the
		 * ATTR_LIST_ENTRY definition.
		 */
564 if ((u8*)al_entry + 6 > al_end || (u8*)al_entry +
565 le16_to_cpu(al_entry->length) > al_end)
566 break;
567 next_al_entry = (ATTR_LIST_ENTRY*)((u8*)al_entry +
568 le16_to_cpu(al_entry->length));
		/* An entry of a larger type is treated as "not found". */
569 if (le32_to_cpu(al_entry->type) > le32_to_cpu(type))
570 goto not_found;
571 if (type != al_entry->type)
572 continue;
573 /*
574 * If @name is present, compare the two names. If @name is
575 * missing, assume we want an unnamed attribute.
576 */
577 al_name_len = al_entry->name_length;
578 al_name = (ntfschar*)((u8*)al_entry + al_entry->name_offset);
579 if (!name) {
580 if (al_name_len)
581 goto not_found;
582 } else if (!ntfs_are_names_equal(al_name, al_name_len, name,
583 name_len, ic, vol->upcase, vol->upcase_len)) {
584 register int rc;
585
586 rc = ntfs_collate_names(name, name_len, al_name,
587 al_name_len, 1, IGNORE_CASE,
588 vol->upcase, vol->upcase_len);
589 /*
590 * If @name collates before al_name, there is no
591 * matching attribute.
592 */
593 if (rc == -1)
594 goto not_found;
595 /* If the strings are not equal, continue search. */
596 if (rc)
597 continue;
598 /*
599 * FIXME: Reverse engineering showed 0, IGNORE_CASE but
600 * that is inconsistent with ntfs_attr_find(). The
601 * subsequent rc checks were also different. Perhaps I
602 * made a mistake in one of the two. Need to recheck
603 * which is correct or at least see what is going on...
604 * (AIA)
605 */
606 rc = ntfs_collate_names(name, name_len, al_name,
607 al_name_len, 1, CASE_SENSITIVE,
608 vol->upcase, vol->upcase_len);
609 if (rc == -1)
610 goto not_found;
611 if (rc)
612 continue;
613 }
614 /*
615 * The names match or @name not present and attribute is
616 * unnamed. Now check @lowest_vcn. Continue search if the
617 * next attribute list entry still fits @lowest_vcn. Otherwise
618 * we have reached the right one or the search has failed.
619 */
620 if (lowest_vcn && (u8*)next_al_entry >= al_start &&
621 (u8*)next_al_entry + 6 < al_end &&
622 (u8*)next_al_entry + le16_to_cpu(
623 next_al_entry->length) <= al_end &&
624 sle64_to_cpu(next_al_entry->lowest_vcn) <=
625 lowest_vcn &&
626 next_al_entry->type == al_entry->type &&
627 next_al_entry->name_length == al_name_len &&
628 ntfs_are_names_equal((ntfschar*)((u8*)
629 next_al_entry +
630 next_al_entry->name_offset),
631 next_al_entry->name_length,
632 al_name, al_name_len, CASE_SENSITIVE,
633 vol->upcase, vol->upcase_len))
634 continue;
		/*
		 * The entry refers to the currently mapped mft record: only
		 * the sequence number needs verifying.
		 */
635 if (MREF_LE(al_entry->mft_reference) == ni->mft_no) {
636 if (MSEQNO_LE(al_entry->mft_reference) != ni->seq_no) {
637 ntfs_error(vol->sb, "Found stale mft "
638 "reference in attribute list "
639 "of base inode 0x%lx.%s",
640 base_ni->mft_no, es);
641 err = -EIO;
642 break;
643 }
644 } else { /* Mft references do not match. */
645 /* If there is a mapped record unmap it first. */
646 if (ni != base_ni)
647 unmap_extent_mft_record(ni);
648 /* Do we want the base record back? */
649 if (MREF_LE(al_entry->mft_reference) ==
650 base_ni->mft_no) {
651 ni = ctx->ntfs_ino = base_ni;
652 ctx->mrec = ctx->base_mrec;
653 } else {
654 /* We want an extent record. */
655 ctx->mrec = map_extent_mft_record(base_ni,
656 le64_to_cpu(
657 al_entry->mft_reference), &ni);
658 if (IS_ERR(ctx->mrec)) {
659 ntfs_error(vol->sb, "Failed to map "
660 "extent mft record "
661 "0x%lx of base inode "
662 "0x%lx.%s",
663 MREF_LE(al_entry->
664 mft_reference),
665 base_ni->mft_no, es);
666 err = PTR_ERR(ctx->mrec);
				/* -ENOENT is reserved for "not found". */
667 if (err == -ENOENT)
668 err = -EIO;
669 /* Cause @ctx to be sanitized below. */
670 ni = NULL;
671 break;
672 }
673 ctx->ntfs_ino = ni;
674 }
			/* Restart at the first attribute of the new record. */
675 ctx->attr = (ATTR_RECORD*)((u8*)ctx->mrec +
676 le16_to_cpu(ctx->mrec->attrs_offset));
677 }
678 /*
679 * ctx->ntfs_ino, ctx->mrec, and ctx->attr now point to the
680 * mft record containing the attribute represented by the
681 * current al_entry.
682 */
683 /*
684 * We could call into ntfs_attr_find() to find the right
685 * attribute in this mft record but this would be less
686 * efficient and not quite accurate as ntfs_attr_find() ignores
687 * the attribute instance numbers for example which become
688 * important when one plays with attribute lists. Also,
689 * because a proper match has been found in the attribute list
690 * entry above, the comparison can now be optimized. So it is
691 * worth re-implementing a simplified ntfs_attr_find() here.
692 */
693 a = ctx->attr;
694 /*
695 * Use a manual loop so we can still use break and continue
696 * with the same meanings as above.
697 */
698do_next_attr_loop:
699 if ((u8*)a < (u8*)ctx->mrec || (u8*)a > (u8*)ctx->mrec +
700 le32_to_cpu(ctx->mrec->bytes_allocated))
701 break;
		/*
		 * End of this mft record without a matching instance: move on
		 * to the next attribute list entry.
		 */
702 if (a->type == AT_END)
703 continue;
704 if (!a->length)
705 break;
		/* Not the instance this entry refers to; try the next one. */
706 if (al_entry->instance != a->instance)
707 goto do_next_attr;
708 /*
709 * If the type and/or the name are mismatched between the
710 * attribute list entry and the attribute record, there is
711 * corruption so we break and return error EIO.
712 */
713 if (al_entry->type != a->type)
714 break;
715 if (!ntfs_are_names_equal((ntfschar*)((u8*)a +
716 le16_to_cpu(a->name_offset)), a->name_length,
717 al_name, al_name_len, CASE_SENSITIVE,
718 vol->upcase, vol->upcase_len))
719 break;
720 ctx->attr = a;
721 /*
722 * If no @val specified or @val specified and it matches, we
723 * have found it!
724 */
725 if (!val || (!a->non_resident && le32_to_cpu(
726 a->data.resident.value_length) == val_len &&
727 !memcmp((u8*)a +
728 le16_to_cpu(a->data.resident.value_offset),
729 val, val_len))) {
730 ntfs_debug("Done, found.");
731 return 0;
732 }
733do_next_attr:
734 /* Proceed to the next attribute in the current mft record. */
735 a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length));
736 goto do_next_attr_loop;
737 }
	/*
	 * Only reached via break.  If no specific error was recorded, a
	 * sanity check failed, which is reported as corruption.
	 */
738 if (!err) {
739 ntfs_error(vol->sb, "Base inode 0x%lx contains corrupt "
740 "attribute list attribute.%s", base_ni->mft_no,
741 es);
742 err = -EIO;
743 }
	/*
	 * Put @ctx back onto the base mft record.  @ni is NULL if mapping the
	 * extent record failed, in which case there is nothing to unmap.
	 */
744 if (ni != base_ni) {
745 if (ni)
746 unmap_extent_mft_record(ni);
747 ctx->ntfs_ino = base_ni;
748 ctx->mrec = ctx->base_mrec;
749 ctx->attr = ctx->base_attr;
750 }
751 if (err != -ENOMEM)
752 NVolSetErrors(vol);
753 return err;
754not_found:
755 /*
756 * If we were looking for AT_END, we reset the search context @ctx and
757 * use ntfs_attr_find() to seek to the end of the base mft record.
758 */
759 if (type == AT_END) {
760 ntfs_attr_reinit_search_ctx(ctx);
761 return ntfs_attr_find(AT_END, name, name_len, ic, val, val_len,
762 ctx);
763 }
764 /*
765 * The attribute was not found. Before we return, we want to ensure
766 * @ctx->mrec and @ctx->attr indicate the position at which the
767 * attribute should be inserted in the base mft record. Since we also
768 * want to preserve @ctx->al_entry we cannot reinitialize the search
769 * context using ntfs_attr_reinit_search_ctx() as this would set
770 * @ctx->al_entry to NULL. Thus we do the necessary bits manually (see
771 * ntfs_attr_init_search_ctx() below). Note, we _only_ preserve
772 * @ctx->al_entry as the remaining fields (base_*) are identical to
773 * their non base_ counterparts and we cannot set @ctx->base_attr
774 * correctly yet as we do not know what @ctx->attr will be set to by
775 * the call to ntfs_attr_find() below.
776 */
777 if (ni != base_ni)
778 unmap_extent_mft_record(ni);
779 ctx->mrec = ctx->base_mrec;
780 ctx->attr = (ATTR_RECORD*)((u8*)ctx->mrec +
781 le16_to_cpu(ctx->mrec->attrs_offset));
782 ctx->is_first = TRUE;
783 ctx->ntfs_ino = base_ni;
784 ctx->base_ntfs_ino = NULL;
785 ctx->base_mrec = NULL;
786 ctx->base_attr = NULL;
787 /*
788 * In case there are multiple matches in the base mft record, need to
789 * keep enumerating until we get an attribute not found response (or
790 * another error), otherwise we would keep returning the same attribute
791 * over and over again and all programs using us for enumeration would
792 * lock up in a tight loop.
793 */
794 do {
795 err = ntfs_attr_find(type, name, name_len, ic, val, val_len,
796 ctx);
797 } while (!err);
798 ntfs_debug("Done, not found.");
799 return err;
800}
801
802/**
803 * ntfs_attr_lookup - find an attribute in an ntfs inode
804 * @type: attribute type to find
805 * @name: attribute name to find (optional, i.e. NULL means don't care)
806 * @name_len: attribute name length (only needed if @name present)
807 * @ic: IGNORE_CASE or CASE_SENSITIVE (ignored if @name not present)
808 * @lowest_vcn: lowest vcn to find (optional, non-resident attributes only)
809 * @val: attribute value to find (optional, resident attributes only)
810 * @val_len: attribute value length
811 * @ctx: search context with mft record and attribute to search from
812 *
813 * Find an attribute in an ntfs inode. On first search @ctx->ntfs_ino must
814 * be the base mft record and @ctx must have been obtained from a call to
815 * ntfs_attr_get_search_ctx().
816 *
817 * This function transparently handles attribute lists and @ctx is used to
818 * continue searches where they were left off at.
819 *
820 * After finishing with the attribute/mft record you need to call
821 * ntfs_attr_put_search_ctx() to cleanup the search context (unmapping any
822 * mapped inodes, etc).
823 *
824 * Return 0 if the search was successful and -errno if not.
825 *
826 * When 0, @ctx->attr is the found attribute and it is in mft record
827 * @ctx->mrec. If an attribute list attribute is present, @ctx->al_entry is
828 * the attribute list entry of the found attribute.
829 *
830 * When -ENOENT, @ctx->attr is the attribute which collates just after the
831 * attribute being searched for, i.e. if one wants to add the attribute to the
832 * mft record this is the correct place to insert it into. If an attribute
833 * list attribute is present, @ctx->al_entry is the attribute list entry which
834 * collates just after the attribute list entry of the attribute being searched
835 * for, i.e. if one wants to add the attribute to the mft record this is the
836 * correct place to insert its attribute list entry into.
837 *
838 * When -errno != -ENOENT, an error occured during the lookup. @ctx->attr is
839 * then undefined and in particular you should not rely on it not changing.
840 */
841int ntfs_attr_lookup(const ATTR_TYPE type, const ntfschar *name,
842 const u32 name_len, const IGNORE_CASE_BOOL ic,
843 const VCN lowest_vcn, const u8 *val, const u32 val_len,
844 ntfs_attr_search_ctx *ctx)
845{
846 ntfs_inode *base_ni;
847
848 ntfs_debug("Entering.");
849 if (ctx->base_ntfs_ino)
850 base_ni = ctx->base_ntfs_ino;
851 else
852 base_ni = ctx->ntfs_ino;
853 /* Sanity check, just for debugging really. */
854 BUG_ON(!base_ni);
855 if (!NInoAttrList(base_ni) || type == AT_ATTRIBUTE_LIST)
856 return ntfs_attr_find(type, name, name_len, ic, val, val_len,
857 ctx);
858 return ntfs_external_attr_find(type, name, name_len, ic, lowest_vcn,
859 val, val_len, ctx);
860}
861
862/**
863 * ntfs_attr_init_search_ctx - initialize an attribute search context
864 * @ctx: attribute search context to initialize
865 * @ni: ntfs inode with which to initialize the search context
866 * @mrec: mft record with which to initialize the search context
867 *
868 * Initialize the attribute search context @ctx with @ni and @mrec.
869 */
870static inline void ntfs_attr_init_search_ctx(ntfs_attr_search_ctx *ctx,
871 ntfs_inode *ni, MFT_RECORD *mrec)
872{
873 ctx->mrec = mrec;
874 /* Sanity checks are performed elsewhere. */
875 ctx->attr = (ATTR_RECORD*)((u8*)mrec + le16_to_cpu(mrec->attrs_offset));
876 ctx->is_first = TRUE;
877 ctx->ntfs_ino = ni;
878 ctx->al_entry = NULL;
879 ctx->base_ntfs_ino = NULL;
880 ctx->base_mrec = NULL;
881 ctx->base_attr = NULL;
882}
883
884/**
885 * ntfs_attr_reinit_search_ctx - reinitialize an attribute search context
886 * @ctx: attribute search context to reinitialize
887 *
888 * Reinitialize the attribute search context @ctx, unmapping an associated
889 * extent mft record if present, and initialize the search context again.
890 *
891 * This is used when a search for a new attribute is being started to reset
892 * the search context to the beginning.
893 */
894void ntfs_attr_reinit_search_ctx(ntfs_attr_search_ctx *ctx)
895{
896 if (likely(!ctx->base_ntfs_ino)) {
897 /* No attribute list. */
898 ctx->is_first = TRUE;
899 /* Sanity checks are performed elsewhere. */
900 ctx->attr = (ATTR_RECORD*)((u8*)ctx->mrec +
901 le16_to_cpu(ctx->mrec->attrs_offset));
902 /*
903 * This needs resetting due to ntfs_external_attr_find() which
904 * can leave it set despite having zeroed ctx->base_ntfs_ino.
905 */
906 ctx->al_entry = NULL;
907 return;
908 } /* Attribute list. */
909 if (ctx->ntfs_ino != ctx->base_ntfs_ino)
910 unmap_extent_mft_record(ctx->ntfs_ino);
911 ntfs_attr_init_search_ctx(ctx, ctx->base_ntfs_ino, ctx->base_mrec);
912 return;
913}
914
915/**
916 * ntfs_attr_get_search_ctx - allocate/initialize a new attribute search context
917 * @ni: ntfs inode with which to initialize the search context
918 * @mrec: mft record with which to initialize the search context
919 *
920 * Allocate a new attribute search context, initialize it with @ni and @mrec,
921 * and return it. Return NULL if allocation failed.
922 */
923ntfs_attr_search_ctx *ntfs_attr_get_search_ctx(ntfs_inode *ni, MFT_RECORD *mrec)
924{
925 ntfs_attr_search_ctx *ctx;
926
927 ctx = kmem_cache_alloc(ntfs_attr_ctx_cache, SLAB_NOFS);
928 if (ctx)
929 ntfs_attr_init_search_ctx(ctx, ni, mrec);
930 return ctx;
931}
932
933/**
934 * ntfs_attr_put_search_ctx - release an attribute search context
935 * @ctx: attribute search context to free
936 *
937 * Release the attribute search context @ctx, unmapping an associated extent
938 * mft record if present.
939 */
940void ntfs_attr_put_search_ctx(ntfs_attr_search_ctx *ctx)
941{
942 if (ctx->base_ntfs_ino && ctx->ntfs_ino != ctx->base_ntfs_ino)
943 unmap_extent_mft_record(ctx->ntfs_ino);
944 kmem_cache_free(ntfs_attr_ctx_cache, ctx);
945 return;
946}
947
948/**
949 * ntfs_attr_find_in_attrdef - find an attribute in the $AttrDef system file
950 * @vol: ntfs volume to which the attribute belongs
951 * @type: attribute type which to find
952 *
953 * Search for the attribute definition record corresponding to the attribute
954 * @type in the $AttrDef system file.
955 *
956 * Return the attribute type definition record if found and NULL if not found.
957 */
958static ATTR_DEF *ntfs_attr_find_in_attrdef(const ntfs_volume *vol,
959 const ATTR_TYPE type)
960{
961 ATTR_DEF *ad;
962
963 BUG_ON(!vol->attrdef);
964 BUG_ON(!type);
965 for (ad = vol->attrdef; (u8*)ad - (u8*)vol->attrdef <
966 vol->attrdef_size && ad->type; ++ad) {
967 /* We have not found it yet, carry on searching. */
968 if (likely(le32_to_cpu(ad->type) < le32_to_cpu(type)))
969 continue;
970 /* We found the attribute; return it. */
971 if (likely(ad->type == type))
972 return ad;
973 /* We have gone too far already. No point in continuing. */
974 break;
975 }
976 /* Attribute not found. */
977 ntfs_debug("Attribute type 0x%x not found in $AttrDef.",
978 le32_to_cpu(type));
979 return NULL;
980}
981
982/**
983 * ntfs_attr_size_bounds_check - check a size of an attribute type for validity
984 * @vol: ntfs volume to which the attribute belongs
985 * @type: attribute type which to check
986 * @size: size which to check
987 *
988 * Check whether the @size in bytes is valid for an attribute of @type on the
989 * ntfs volume @vol. This information is obtained from $AttrDef system file.
990 *
991 * Return 0 if valid, -ERANGE if not valid, or -ENOENT if the attribute is not
992 * listed in $AttrDef.
993 */
994int ntfs_attr_size_bounds_check(const ntfs_volume *vol, const ATTR_TYPE type,
995 const s64 size)
996{
997 ATTR_DEF *ad;
998
999 BUG_ON(size < 0);
1000 /*
1001 * $ATTRIBUTE_LIST has a maximum size of 256kiB, but this is not
1002 * listed in $AttrDef.
1003 */
1004 if (unlikely(type == AT_ATTRIBUTE_LIST && size > 256 * 1024))
1005 return -ERANGE;
1006 /* Get the $AttrDef entry for the attribute @type. */
1007 ad = ntfs_attr_find_in_attrdef(vol, type);
1008 if (unlikely(!ad))
1009 return -ENOENT;
1010 /* Do the bounds check. */
1011 if (((sle64_to_cpu(ad->min_size) > 0) &&
1012 size < sle64_to_cpu(ad->min_size)) ||
1013 ((sle64_to_cpu(ad->max_size) > 0) && size >
1014 sle64_to_cpu(ad->max_size)))
1015 return -ERANGE;
1016 return 0;
1017}
1018
1019/**
1020 * ntfs_attr_can_be_non_resident - check if an attribute can be non-resident
1021 * @vol: ntfs volume to which the attribute belongs
1022 * @type: attribute type which to check
1023 *
1024 * Check whether the attribute of @type on the ntfs volume @vol is allowed to
1025 * be non-resident. This information is obtained from $AttrDef system file.
1026 *
1027 * Return 0 if the attribute is allowed to be non-resident, -EPERM if not, or
1028 * -ENOENT if the attribute is not listed in $AttrDef.
1029 */
1030int ntfs_attr_can_be_non_resident(const ntfs_volume *vol, const ATTR_TYPE type)
1031{
1032 ATTR_DEF *ad;
1033
1034 /*
1035 * $DATA is always allowed to be non-resident even if $AttrDef does not
1036 * specify this in the flags of the $DATA attribute definition record.
1037 */
1038 if (type == AT_DATA)
1039 return 0;
1040 /* Find the attribute definition record in $AttrDef. */
1041 ad = ntfs_attr_find_in_attrdef(vol, type);
1042 if (unlikely(!ad))
1043 return -ENOENT;
1044 /* Check the flags and return the result. */
1045 if (ad->flags & CAN_BE_NON_RESIDENT)
1046 return 0;
1047 return -EPERM;
1048}
1049
1050/**
1051 * ntfs_attr_can_be_resident - check if an attribute can be resident
1052 * @vol: ntfs volume to which the attribute belongs
1053 * @type: attribute type which to check
1054 *
1055 * Check whether the attribute of @type on the ntfs volume @vol is allowed to
1056 * be resident. This information is derived from our ntfs knowledge and may
1057 * not be completely accurate, especially when user defined attributes are
1058 * present. Basically we allow everything to be resident except for index
1059 * allocation and $EA attributes.
1060 *
1061 * Return 0 if the attribute is allowed to be non-resident and -EPERM if not.
1062 *
1063 * Warning: In the system file $MFT the attribute $Bitmap must be non-resident
1064 * otherwise windows will not boot (blue screen of death)! We cannot
1065 * check for this here as we do not know which inode's $Bitmap is
1066 * being asked about so the caller needs to special case this.
1067 */
1068int ntfs_attr_can_be_resident(const ntfs_volume *vol, const ATTR_TYPE type)
1069{
1070 if (type != AT_INDEX_ALLOCATION && type != AT_EA)
1071 return 0;
1072 return -EPERM;
1073}
1074
1075/**
1076 * ntfs_attr_record_resize - resize an attribute record
1077 * @m: mft record containing attribute record
1078 * @a: attribute record to resize
1079 * @new_size: new size in bytes to which to resize the attribute record @a
1080 *
1081 * Resize the attribute record @a, i.e. the resident part of the attribute, in
1082 * the mft record @m to @new_size bytes.
1083 *
1084 * Return 0 on success and -errno on error. The following error codes are
1085 * defined:
1086 * -ENOSPC - Not enough space in the mft record @m to perform the resize.
1087 *
1088 * Note: On error, no modifications have been performed whatsoever.
1089 *
1090 * Warning: If you make a record smaller without having copied all the data you
1091 * are interested in the data may be overwritten.
1092 */
1093int ntfs_attr_record_resize(MFT_RECORD *m, ATTR_RECORD *a, u32 new_size)
1094{
1095 ntfs_debug("Entering for new_size %u.", new_size);
1096 /* Align to 8 bytes if it is not already done. */
1097 if (new_size & 7)
1098 new_size = (new_size + 7) & ~7;
1099 /* If the actual attribute length has changed, move things around. */
1100 if (new_size != le32_to_cpu(a->length)) {
1101 u32 new_muse = le32_to_cpu(m->bytes_in_use) -
1102 le32_to_cpu(a->length) + new_size;
1103 /* Not enough space in this mft record. */
1104 if (new_muse > le32_to_cpu(m->bytes_allocated))
1105 return -ENOSPC;
1106 /* Move attributes following @a to their new location. */
1107 memmove((u8*)a + new_size, (u8*)a + le32_to_cpu(a->length),
1108 le32_to_cpu(m->bytes_in_use) - ((u8*)a -
1109 (u8*)m) - le32_to_cpu(a->length));
1110 /* Adjust @m to reflect the change in used space. */
1111 m->bytes_in_use = cpu_to_le32(new_muse);
1112 /* Adjust @a to reflect the new size. */
1113 if (new_size >= offsetof(ATTR_REC, length) + sizeof(a->length))
1114 a->length = cpu_to_le32(new_size);
1115 }
1116 return 0;
1117}
1118
1119/**
1120 * ntfs_attr_set - fill (a part of) an attribute with a byte
1121 * @ni: ntfs inode describing the attribute to fill
1122 * @ofs: offset inside the attribute at which to start to fill
1123 * @cnt: number of bytes to fill
1124 * @val: the unsigned 8-bit value with which to fill the attribute
1125 *
1126 * Fill @cnt bytes of the attribute described by the ntfs inode @ni starting at
1127 * byte offset @ofs inside the attribute with the constant byte @val.
1128 *
1129 * This function is effectively like memset() applied to an ntfs attribute.
1130 *
1131 * Return 0 on success and -errno on error. An error code of -ESPIPE means
1132 * that @ofs + @cnt were outside the end of the attribute and no write was
1133 * performed.
1134 */
1135int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
1136{
1137 ntfs_volume *vol = ni->vol;
1138 struct address_space *mapping;
1139 struct page *page;
1140 u8 *kaddr;
1141 pgoff_t idx, end;
1142 unsigned int start_ofs, end_ofs, size;
1143
1144 ntfs_debug("Entering for ofs 0x%llx, cnt 0x%llx, val 0x%hx.",
1145 (long long)ofs, (long long)cnt, val);
1146 BUG_ON(ofs < 0);
1147 BUG_ON(cnt < 0);
1148 if (!cnt)
1149 goto done;
1150 mapping = VFS_I(ni)->i_mapping;
1151 /* Work out the starting index and page offset. */
1152 idx = ofs >> PAGE_CACHE_SHIFT;
1153 start_ofs = ofs & ~PAGE_CACHE_MASK;
1154 /* Work out the ending index and page offset. */
1155 end = ofs + cnt;
1156 end_ofs = end & ~PAGE_CACHE_MASK;
1157 /* If the end is outside the inode size return -ESPIPE. */
1158 if (unlikely(end > VFS_I(ni)->i_size)) {
1159 ntfs_error(vol->sb, "Request exceeds end of attribute.");
1160 return -ESPIPE;
1161 }
1162 end >>= PAGE_CACHE_SHIFT;
1163 /* If there is a first partial page, need to do it the slow way. */
1164 if (start_ofs) {
1165 page = read_cache_page(mapping, idx,
1166 (filler_t*)mapping->a_ops->readpage, NULL);
1167 if (IS_ERR(page)) {
1168 ntfs_error(vol->sb, "Failed to read first partial "
1169 "page (sync error, index 0x%lx).", idx);
1170 return PTR_ERR(page);
1171 }
1172 wait_on_page_locked(page);
1173 if (unlikely(!PageUptodate(page))) {
1174 ntfs_error(vol->sb, "Failed to read first partial page "
1175 "(async error, index 0x%lx).", idx);
1176 page_cache_release(page);
1177 return PTR_ERR(page);
1178 }
1179 /*
1180 * If the last page is the same as the first page, need to
1181 * limit the write to the end offset.
1182 */
1183 size = PAGE_CACHE_SIZE;
1184 if (idx == end)
1185 size = end_ofs;
1186 kaddr = kmap_atomic(page, KM_USER0);
1187 memset(kaddr + start_ofs, val, size - start_ofs);
1188 flush_dcache_page(page);
1189 kunmap_atomic(kaddr, KM_USER0);
1190 set_page_dirty(page);
1191 page_cache_release(page);
1192 if (idx == end)
1193 goto done;
1194 idx++;
1195 }
1196 /* Do the whole pages the fast way. */
1197 for (; idx < end; idx++) {
1198 /* Find or create the current page. (The page is locked.) */
1199 page = grab_cache_page(mapping, idx);
1200 if (unlikely(!page)) {
1201 ntfs_error(vol->sb, "Insufficient memory to grab "
1202 "page (index 0x%lx).", idx);
1203 return -ENOMEM;
1204 }
1205 kaddr = kmap_atomic(page, KM_USER0);
1206 memset(kaddr, val, PAGE_CACHE_SIZE);
1207 flush_dcache_page(page);
1208 kunmap_atomic(kaddr, KM_USER0);
1209 /*
1210 * If the page has buffers, mark them uptodate since buffer
1211 * state and not page state is definitive in 2.6 kernels.
1212 */
1213 if (page_has_buffers(page)) {
1214 struct buffer_head *bh, *head;
1215
1216 bh = head = page_buffers(page);
1217 do {
1218 set_buffer_uptodate(bh);
1219 } while ((bh = bh->b_this_page) != head);
1220 }
1221 /* Now that buffers are uptodate, set the page uptodate, too. */
1222 SetPageUptodate(page);
1223 /*
1224 * Set the page and all its buffers dirty and mark the inode
1225 * dirty, too. The VM will write the page later on.
1226 */
1227 set_page_dirty(page);
1228 /* Finally unlock and release the page. */
1229 unlock_page(page);
1230 page_cache_release(page);
1231 }
1232 /* If there is a last partial page, need to do it the slow way. */
1233 if (end_ofs) {
1234 page = read_cache_page(mapping, idx,
1235 (filler_t*)mapping->a_ops->readpage, NULL);
1236 if (IS_ERR(page)) {
1237 ntfs_error(vol->sb, "Failed to read last partial page "
1238 "(sync error, index 0x%lx).", idx);
1239 return PTR_ERR(page);
1240 }
1241 wait_on_page_locked(page);
1242 if (unlikely(!PageUptodate(page))) {
1243 ntfs_error(vol->sb, "Failed to read last partial page "
1244 "(async error, index 0x%lx).", idx);
1245 page_cache_release(page);
1246 return PTR_ERR(page);
1247 }
1248 kaddr = kmap_atomic(page, KM_USER0);
1249 memset(kaddr, val, end_ofs);
1250 flush_dcache_page(page);
1251 kunmap_atomic(kaddr, KM_USER0);
1252 set_page_dirty(page);
1253 page_cache_release(page);
1254 }
1255done:
1256 ntfs_debug("Done.");
1257 return 0;
1258}
diff --git a/fs/ntfs/attrib.h b/fs/ntfs/attrib.h
new file mode 100644
index 000000000000..e0c2c6c81bc0
--- /dev/null
+++ b/fs/ntfs/attrib.h
@@ -0,0 +1,100 @@
1/*
2 * attrib.h - Defines for attribute handling in NTFS Linux kernel driver.
3 * Part of the Linux-NTFS project.
4 *
5 * Copyright (c) 2001-2004 Anton Altaparmakov
6 * Copyright (c) 2002 Richard Russon
7 *
8 * This program/include file is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as published
10 * by the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program/include file is distributed in the hope that it will be
14 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
15 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program (in the main directory of the Linux-NTFS
20 * distribution in the file COPYING); if not, write to the Free Software
21 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
23
24#ifndef _LINUX_NTFS_ATTRIB_H
25#define _LINUX_NTFS_ATTRIB_H
26
27#include "endian.h"
28#include "types.h"
29#include "layout.h"
30#include "inode.h"
31#include "runlist.h"
32#include "volume.h"
33
34/**
35 * ntfs_attr_search_ctx - used in attribute search functions
36 * @mrec: buffer containing mft record to search
37 * @attr: attribute record in @mrec where to begin/continue search
38 * @is_first: if true ntfs_attr_lookup() begins search with @attr, else after
39 *
40 * Structure must be initialized to zero before the first call to one of the
41 * attribute search functions. Initialize @mrec to point to the mft record to
42 * search, and @attr to point to the first attribute within @mrec (not necessary
43 * if calling the _first() functions), and set @is_first to TRUE (not necessary
44 * if calling the _first() functions).
45 *
46 * If @is_first is TRUE, the search begins with @attr. If @is_first is FALSE,
47 * the search begins after @attr. This is so that, after the first call to one
48 * of the search attribute functions, we can call the function again, without
49 * any modification of the search context, to automagically get the next
50 * matching attribute.
51 */
52typedef struct {
53 MFT_RECORD *mrec;
54 ATTR_RECORD *attr;
55 BOOL is_first;
56 ntfs_inode *ntfs_ino; /* ntfs inode @mrec belongs to; if it differs from a non-NULL @base_ntfs_ino it is an extent inode whose record is unmapped on put/reinit */
57 ATTR_LIST_ENTRY *al_entry; /* NULL after (re)initialization; can be left set by ntfs_external_attr_find() - presumably the current attribute list entry, TODO confirm */
58 ntfs_inode *base_ntfs_ino; /* NULL after initialization; treated as "no attribute list" while NULL, otherwise used as the base inode by ntfs_attr_lookup() */
59 MFT_RECORD *base_mrec; /* NULL after initialization; mft record used together with @base_ntfs_ino when the context is reinitialized */
60 ATTR_RECORD *base_attr; /* NULL after initialization; presumably an attribute record in @base_mrec - TODO confirm against ntfs_external_attr_find() */
61} ntfs_attr_search_ctx;
62
63extern int ntfs_map_runlist(ntfs_inode *ni, VCN vcn);
64
65extern runlist_element *ntfs_find_vcn(ntfs_inode *ni, const VCN vcn,
66 const BOOL need_write);
67
68int ntfs_attr_lookup(const ATTR_TYPE type, const ntfschar *name,
69 const u32 name_len, const IGNORE_CASE_BOOL ic,
70 const VCN lowest_vcn, const u8 *val, const u32 val_len,
71 ntfs_attr_search_ctx *ctx);
72
73extern int load_attribute_list(ntfs_volume *vol, runlist *rl, u8 *al_start,
74 const s64 size, const s64 initialized_size);
75
76static inline s64 ntfs_attr_size(const ATTR_RECORD *a)
77{
78 if (!a->non_resident)
79 return (s64)le32_to_cpu(a->data.resident.value_length);
80 return sle64_to_cpu(a->data.non_resident.data_size);
81}
82
83extern void ntfs_attr_reinit_search_ctx(ntfs_attr_search_ctx *ctx);
84extern ntfs_attr_search_ctx *ntfs_attr_get_search_ctx(ntfs_inode *ni,
85 MFT_RECORD *mrec);
86extern void ntfs_attr_put_search_ctx(ntfs_attr_search_ctx *ctx);
87
88extern int ntfs_attr_size_bounds_check(const ntfs_volume *vol,
89 const ATTR_TYPE type, const s64 size);
90extern int ntfs_attr_can_be_non_resident(const ntfs_volume *vol,
91 const ATTR_TYPE type);
92extern int ntfs_attr_can_be_resident(const ntfs_volume *vol,
93 const ATTR_TYPE type);
94
95extern int ntfs_attr_record_resize(MFT_RECORD *m, ATTR_RECORD *a, u32 new_size);
96
97extern int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt,
98 const u8 val);
99
100#endif /* _LINUX_NTFS_ATTRIB_H */
diff --git a/fs/ntfs/bitmap.c b/fs/ntfs/bitmap.c
new file mode 100644
index 000000000000..12cf2e30c7dd
--- /dev/null
+++ b/fs/ntfs/bitmap.c
@@ -0,0 +1,192 @@
1/*
2 * bitmap.c - NTFS kernel bitmap handling. Part of the Linux-NTFS project.
3 *
4 * Copyright (c) 2004 Anton Altaparmakov
5 *
6 * This program/include file is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as published
8 * by the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program/include file is distributed in the hope that it will be
12 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
13 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program (in the main directory of the Linux-NTFS
18 * distribution in the file COPYING); if not, write to the Free Software
19 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21
22#ifdef NTFS_RW
23
24#include <linux/pagemap.h>
25
26#include "bitmap.h"
27#include "debug.h"
28#include "aops.h"
29#include "ntfs.h"
30
31/**
32 * __ntfs_bitmap_set_bits_in_run - set a run of bits in a bitmap to a value
33 * @vi: vfs inode describing the bitmap
34 * @start_bit: first bit to set
35 * @count: number of bits to set
36 * @value: value to set the bits to (i.e. 0 or 1)
37 * @is_rollback: if TRUE this is a rollback operation
38 *
39 * Set @count bits starting at bit @start_bit in the bitmap described by the
40 * vfs inode @vi to @value, where @value is either 0 or 1.
41 *
42 * @is_rollback should always be FALSE, it is for internal use to rollback
43 * errors. You probably want to use ntfs_bitmap_set_bits_in_run() instead.
44 *
45 * Return 0 on success and -errno on error.
46 */
47int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit,
48 const s64 count, const u8 value, const BOOL is_rollback)
49{
50 s64 cnt = count;
51 pgoff_t index, end_index;
52 struct address_space *mapping;
53 struct page *page;
54 u8 *kaddr;
55 int pos, len;
56 u8 bit;
57
58 BUG_ON(!vi);
59 ntfs_debug("Entering for i_ino 0x%lx, start_bit 0x%llx, count 0x%llx, "
60 "value %u.%s", vi->i_ino, (unsigned long long)start_bit,
61 (unsigned long long)cnt, (unsigned int)value,
62 is_rollback ? " (rollback)" : "");
63 BUG_ON(start_bit < 0);
64 BUG_ON(cnt < 0);
65 BUG_ON(value > 1);
66 /*
67 * Calculate the indices for the pages containing the first and last
68 * bits, i.e. @start_bit and @start_bit + @cnt - 1, respectively.
69 */
70 index = start_bit >> (3 + PAGE_CACHE_SHIFT);
71 end_index = (start_bit + cnt - 1) >> (3 + PAGE_CACHE_SHIFT);
72
73 /* Get the page containing the first bit (@start_bit). */
74 mapping = vi->i_mapping;
75 page = ntfs_map_page(mapping, index);
76 if (IS_ERR(page)) {
77 if (!is_rollback)
78 ntfs_error(vi->i_sb, "Failed to map first page (error "
79 "%li), aborting.", PTR_ERR(page));
80 return PTR_ERR(page);
81 }
82 kaddr = page_address(page);
83
84 /* Set @pos to the position of the byte containing @start_bit. */
85 pos = (start_bit >> 3) & ~PAGE_CACHE_MASK;
86
87 /* Calculate the position of @start_bit in the first byte. */
88 bit = start_bit & 7;
89
90 /* If the first byte is partial, modify the appropriate bits in it. */
91 if (bit) {
92 u8 *byte = kaddr + pos;
93 while ((bit & 7) && cnt--) {
94 if (value)
95 *byte |= 1 << bit++;
96 else
97 *byte &= ~(1 << bit++);
98 }
99 /* If we are done, unmap the page and return success. */
100 if (!cnt)
101 goto done;
102
103 /* Update @pos to the new position. */
104 pos++;
105 }
106 /*
107 * Depending on @value, modify all remaining whole bytes in the page up
108 * to @cnt.
109 */
110 len = min_t(s64, cnt >> 3, PAGE_CACHE_SIZE - pos);
111 memset(kaddr + pos, value ? 0xff : 0, len);
112 cnt -= len << 3;
113
114 /* Update @len to point to the first not-done byte in the page. */
115 if (cnt < 8)
116 len += pos;
117
118 /* If we are not in the last page, deal with all subsequent pages. */
119 while (index < end_index) {
120 BUG_ON(cnt <= 0);
121
122 /* Update @index and get the next page. */
123 flush_dcache_page(page);
124 set_page_dirty(page);
125 ntfs_unmap_page(page);
126 page = ntfs_map_page(mapping, ++index);
127 if (IS_ERR(page))
128 goto rollback;
129 kaddr = page_address(page);
130 /*
131 * Depending on @value, modify all remaining whole bytes in the
132 * page up to @cnt.
133 */
134 len = min_t(s64, cnt >> 3, PAGE_CACHE_SIZE);
135 memset(kaddr, value ? 0xff : 0, len);
136 cnt -= len << 3;
137 }
138 /*
139 * The currently mapped page is the last one. If the last byte is
140 * partial, modify the appropriate bits in it. Note, @len is the
141 * position of the last byte inside the page.
142 */
143 if (cnt) {
144 u8 *byte;
145
146 BUG_ON(cnt > 7);
147
148 bit = cnt;
149 byte = kaddr + len;
150 while (bit--) {
151 if (value)
152 *byte |= 1 << bit;
153 else
154 *byte &= ~(1 << bit);
155 }
156 }
157done:
158 /* We are done. Unmap the page and return success. */
159 flush_dcache_page(page);
160 set_page_dirty(page);
161 ntfs_unmap_page(page);
162 ntfs_debug("Done.");
163 return 0;
164rollback:
165 /*
166 * Current state:
167 * - no pages are mapped
168 * - @count - @cnt is the number of bits that have been modified
169 */
170 if (is_rollback)
171 return PTR_ERR(page);
172 if (count != cnt)
173 pos = __ntfs_bitmap_set_bits_in_run(vi, start_bit, count - cnt,
174 value ? 0 : 1, TRUE);
175 else
176 pos = 0;
177 if (!pos) {
178 /* Rollback was successful. */
179 ntfs_error(vi->i_sb, "Failed to map subsequent page (error "
180 "%li), aborting.", PTR_ERR(page));
181 } else {
182 /* Rollback failed. */
183 ntfs_error(vi->i_sb, "Failed to map subsequent page (error "
184 "%li) and rollback failed (error %i). "
185 "Aborting and leaving inconsistent metadata. "
186 "Unmount and run chkdsk.", PTR_ERR(page), pos);
187 NVolSetErrors(NTFS_SB(vi->i_sb));
188 }
189 return PTR_ERR(page);
190}
191
192#endif /* NTFS_RW */
diff --git a/fs/ntfs/bitmap.h b/fs/ntfs/bitmap.h
new file mode 100644
index 000000000000..bb50d6bc9212
--- /dev/null
+++ b/fs/ntfs/bitmap.h
@@ -0,0 +1,118 @@
1/*
2 * bitmap.h - Defines for NTFS kernel bitmap handling. Part of the Linux-NTFS
3 * project.
4 *
5 * Copyright (c) 2004 Anton Altaparmakov
6 *
7 * This program/include file is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as published
9 * by the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program/include file is distributed in the hope that it will be
13 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program (in the main directory of the Linux-NTFS
19 * distribution in the file COPYING); if not, write to the Free Software
20 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 */
22
23#ifndef _LINUX_NTFS_BITMAP_H
24#define _LINUX_NTFS_BITMAP_H
25
26#ifdef NTFS_RW
27
28#include <linux/fs.h>
29
30#include "types.h"
31
32extern int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit,
33 const s64 count, const u8 value, const BOOL is_rollback);
34
35/**
36 * ntfs_bitmap_set_bits_in_run - set a run of bits in a bitmap to a value
37 * @vi: vfs inode describing the bitmap
38 * @start_bit: first bit to set
39 * @count: number of bits to set
40 * @value: value to set the bits to (i.e. 0 or 1)
41 *
42 * Set @count bits starting at bit @start_bit in the bitmap described by the
43 * vfs inode @vi to @value, where @value is either 0 or 1.
44 *
45 * Return 0 on success and -errno on error.
46 */
47static inline int ntfs_bitmap_set_bits_in_run(struct inode *vi,
48 const s64 start_bit, const s64 count, const u8 value)
49{
50 return __ntfs_bitmap_set_bits_in_run(vi, start_bit, count, value,
51 FALSE);
52}
53
54/**
55 * ntfs_bitmap_set_run - set a run of bits in a bitmap
56 * @vi: vfs inode describing the bitmap
57 * @start_bit: first bit to set
58 * @count: number of bits to set
59 *
60 * Set @count bits starting at bit @start_bit in the bitmap described by the
61 * vfs inode @vi.
62 *
63 * Return 0 on success and -errno on error.
64 */
65static inline int ntfs_bitmap_set_run(struct inode *vi, const s64 start_bit,
66 const s64 count)
67{
68 return ntfs_bitmap_set_bits_in_run(vi, start_bit, count, 1);
69}
70
71/**
72 * ntfs_bitmap_clear_run - clear a run of bits in a bitmap
73 * @vi: vfs inode describing the bitmap
74 * @start_bit: first bit to clear
75 * @count: number of bits to clear
76 *
77 * Clear @count bits starting at bit @start_bit in the bitmap described by the
78 * vfs inode @vi.
79 *
80 * Return 0 on success and -errno on error.
81 */
82static inline int ntfs_bitmap_clear_run(struct inode *vi, const s64 start_bit,
83 const s64 count)
84{
85 return ntfs_bitmap_set_bits_in_run(vi, start_bit, count, 0);
86}
87
88/**
89 * ntfs_bitmap_set_bit - set a bit in a bitmap
90 * @vi: vfs inode describing the bitmap
91 * @bit: bit to set
92 *
93 * Set bit @bit in the bitmap described by the vfs inode @vi.
94 *
95 * Return 0 on success and -errno on error.
96 */
97static inline int ntfs_bitmap_set_bit(struct inode *vi, const s64 bit)
98{
99 return ntfs_bitmap_set_run(vi, bit, 1);
100}
101
102/**
103 * ntfs_bitmap_clear_bit - clear a bit in a bitmap
104 * @vi: vfs inode describing the bitmap
105 * @bit: bit to clear
106 *
107 * Clear bit @bit in the bitmap described by the vfs inode @vi.
108 *
109 * Return 0 on success and -errno on error.
110 */
111static inline int ntfs_bitmap_clear_bit(struct inode *vi, const s64 bit)
112{
113 return ntfs_bitmap_clear_run(vi, bit, 1);
114}
115
116#endif /* NTFS_RW */
117
118#endif /* defined _LINUX_NTFS_BITMAP_H */
diff --git a/fs/ntfs/collate.c b/fs/ntfs/collate.c
new file mode 100644
index 000000000000..4a28ab3898ef
--- /dev/null
+++ b/fs/ntfs/collate.c
@@ -0,0 +1,124 @@
1/*
2 * collate.c - NTFS kernel collation handling. Part of the Linux-NTFS project.
3 *
4 * Copyright (c) 2004 Anton Altaparmakov
5 *
6 * This program/include file is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as published
8 * by the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program/include file is distributed in the hope that it will be
12 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
13 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program (in the main directory of the Linux-NTFS
18 * distribution in the file COPYING); if not, write to the Free Software
19 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21
22#include "collate.h"
23#include "debug.h"
24#include "ntfs.h"
25
26static int ntfs_collate_binary(ntfs_volume *vol,
27 const void *data1, const int data1_len,
28 const void *data2, const int data2_len)
29{
30 int rc;
31
32 ntfs_debug("Entering.");
33 rc = memcmp(data1, data2, min(data1_len, data2_len));
34 if (!rc && (data1_len != data2_len)) {
35 if (data1_len < data2_len)
36 rc = -1;
37 else
38 rc = 1;
39 }
40 ntfs_debug("Done, returning %i", rc);
41 return rc;
42}
43
44static int ntfs_collate_ntofs_ulong(ntfs_volume *vol,
45 const void *data1, const int data1_len,
46 const void *data2, const int data2_len)
47{
48 int rc;
49 u32 d1, d2;
50
51 ntfs_debug("Entering.");
52 // FIXME: We don't really want to bug here.
53 BUG_ON(data1_len != data2_len);
54 BUG_ON(data1_len != 4);
55 d1 = le32_to_cpup(data1);
56 d2 = le32_to_cpup(data2);
57 if (d1 < d2)
58 rc = -1;
59 else {
60 if (d1 == d2)
61 rc = 0;
62 else
63 rc = 1;
64 }
65 ntfs_debug("Done, returning %i", rc);
66 return rc;
67}
68
69typedef int (*ntfs_collate_func_t)(ntfs_volume *, const void *, const int,
70 const void *, const int);
71
72static ntfs_collate_func_t ntfs_do_collate0x0[3] = {
73 ntfs_collate_binary,
74 NULL/*ntfs_collate_file_name*/,
75 NULL/*ntfs_collate_unicode_string*/,
76};
77
78static ntfs_collate_func_t ntfs_do_collate0x1[4] = {
79 ntfs_collate_ntofs_ulong,
80 NULL/*ntfs_collate_ntofs_sid*/,
81 NULL/*ntfs_collate_ntofs_security_hash*/,
82 NULL/*ntfs_collate_ntofs_ulongs*/,
83};
84
85/**
86 * ntfs_collate - collate two data items using a specified collation rule
87 * @vol: ntfs volume to which the data items belong
88 * @cr: collation rule to use when comparing the items
89 * @data1: first data item to collate
90 * @data1_len: length in bytes of @data1
91 * @data2: second data item to collate
92 * @data2_len: length in bytes of @data2
93 *
94 * Collate the two data items @data1 and @data2 using the collation rule @cr
95 * and return -1, 0, ir 1 if @data1 is found, respectively, to collate before,
96 * to match, or to collate after @data2.
97 *
98 * For speed we use the collation rule @cr as an index into two tables of
99 * function pointers to call the appropriate collation function.
100 */
101int ntfs_collate(ntfs_volume *vol, COLLATION_RULE cr,
102 const void *data1, const int data1_len,
103 const void *data2, const int data2_len) {
104 int i;
105
106 ntfs_debug("Entering.");
107 /*
108 * FIXME: At the moment we only support COLLATION_BINARY and
109 * COLLATION_NTOFS_ULONG, so we BUG() for everything else for now.
110 */
111 BUG_ON(cr != COLLATION_BINARY && cr != COLLATION_NTOFS_ULONG);
112 i = le32_to_cpu(cr);
113 BUG_ON(i < 0);
114 if (i <= 0x02)
115 return ntfs_do_collate0x0[i](vol, data1, data1_len,
116 data2, data2_len);
117 BUG_ON(i < 0x10);
118 i -= 0x10;
119 if (likely(i <= 3))
120 return ntfs_do_collate0x1[i](vol, data1, data1_len,
121 data2, data2_len);
122 BUG();
123 return 0;
124}
diff --git a/fs/ntfs/collate.h b/fs/ntfs/collate.h
new file mode 100644
index 000000000000..e027f36fcc2f
--- /dev/null
+++ b/fs/ntfs/collate.h
@@ -0,0 +1,50 @@
1/*
2 * collate.h - Defines for NTFS kernel collation handling. Part of the
3 * Linux-NTFS project.
4 *
5 * Copyright (c) 2004 Anton Altaparmakov
6 *
7 * This program/include file is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as published
9 * by the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program/include file is distributed in the hope that it will be
13 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program (in the main directory of the Linux-NTFS
19 * distribution in the file COPYING); if not, write to the Free Software
20 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 */
22
23#ifndef _LINUX_NTFS_COLLATE_H
24#define _LINUX_NTFS_COLLATE_H
25
26#include "types.h"
27#include "volume.h"
28
29static inline BOOL ntfs_is_collation_rule_supported(COLLATION_RULE cr) {
30 int i;
31
32 /*
33 * FIXME: At the moment we only support COLLATION_BINARY and
34 * COLLATION_NTOFS_ULONG, so we return false for everything else for
35 * now.
36 */
37 if (unlikely(cr != COLLATION_BINARY && cr != COLLATION_NTOFS_ULONG))
38 return FALSE;
39 i = le32_to_cpu(cr);
40 if (likely(((i >= 0) && (i <= 0x02)) ||
41 ((i >= 0x10) && (i <= 0x13))))
42 return TRUE;
43 return FALSE;
44}
45
/*
 * Collate @data1 (@data1_len bytes) against @data2 (@data2_len bytes) on
 * volume @vol using collation rule @cr.  Callers should first check the rule
 * with ntfs_is_collation_rule_supported().
 */
46extern int ntfs_collate(ntfs_volume *vol, COLLATION_RULE cr,
47 const void *data1, const int data1_len,
48 const void *data2, const int data2_len);
49
50#endif /* _LINUX_NTFS_COLLATE_H */
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
new file mode 100644
index 000000000000..ee5ae706f861
--- /dev/null
+++ b/fs/ntfs/compress.c
@@ -0,0 +1,957 @@
1/**
2 * compress.c - NTFS kernel compressed attributes handling.
3 * Part of the Linux-NTFS project.
4 *
5 * Copyright (c) 2001-2004 Anton Altaparmakov
6 * Copyright (c) 2002 Richard Russon
7 *
8 * This program/include file is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as published
10 * by the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program/include file is distributed in the hope that it will be
14 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
15 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program (in the main directory of the Linux-NTFS
20 * distribution in the file COPYING); if not, write to the Free Software
21 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
23
24#include <linux/fs.h>
25#include <linux/buffer_head.h>
26#include <linux/blkdev.h>
27#include <linux/vmalloc.h>
28
29#include "attrib.h"
30#include "inode.h"
31#include "debug.h"
32#include "ntfs.h"
33
/**
 * ntfs_compression_constants - constants used by the (de)compression code
 */
typedef enum {
	/* Token types, and the mask selecting the token type bit in a tag. */
	NTFS_SYMBOL_TOKEN	= 0,
	NTFS_PHRASE_TOKEN	= 1,
	NTFS_TOKEN_MASK		= 1,

	/*
	 * Compression sub-block (sb) constants: the mask for the size field
	 * of the sb header, the uncompressed size of a sb, and the header
	 * flag marking a sb as compressed.
	 */
	NTFS_SB_SIZE_MASK	= 0xfff,
	NTFS_SB_SIZE		= 1 << 12,
	NTFS_SB_IS_COMPRESSED	= 1 << 15,

	/*
	 * A compression block is by definition 16 clusters and the largest
	 * supported cluster size is 4kiB, so a compression block never
	 * exceeds 64kiB.  This is the size used when allocating the
	 * compression buffer.
	 */
	NTFS_MAX_CB_SIZE	= 16 * 4096,
} ntfs_compression_constants;
56
57/**
58 * ntfs_compression_buffer - one buffer for the decompression engine
 *
 * Allocated (NTFS_MAX_CB_SIZE bytes) by allocate_compression_buffers() and
 * released by free_compression_buffers().  ntfs_read_compressed_block()
 * copies each compression block into it before decompressing.  There is a
 * single buffer shared by all users, hence the spinlock below.
59 */
60static u8 *ntfs_compression_buffer = NULL;
61
62/**
63 * ntfs_cb_lock - spinlock which protects ntfs_compression_buffer
 *
 * Taken by ntfs_read_compressed_block() before it fills the buffer and
 * dropped as soon as the buffer contents are no longer needed, so code
 * holding it must not sleep.
64 */
65static DEFINE_SPINLOCK(ntfs_cb_lock);
66
67/**
68 * allocate_compression_buffers - allocate the decompression buffers
69 *
70 * Caller has to hold the ntfs_lock semaphore.
71 *
72 * Return 0 on success or -ENOMEM if the allocations failed.
73 */
74int allocate_compression_buffers(void)
75{
76 BUG_ON(ntfs_compression_buffer);
77
78 ntfs_compression_buffer = vmalloc(NTFS_MAX_CB_SIZE);
79 if (!ntfs_compression_buffer)
80 return -ENOMEM;
81 return 0;
82}
83
84/**
85 * free_compression_buffers - free the decompression buffers
86 *
87 * Caller has to hold the ntfs_lock semaphore.
88 */
89void free_compression_buffers(void)
90{
91 BUG_ON(!ntfs_compression_buffer);
92 vfree(ntfs_compression_buffer);
93 ntfs_compression_buffer = NULL;
94}
95
96/**
97 * zero_partial_compressed_page - zero out of bounds compressed page region
98 */
99static void zero_partial_compressed_page(ntfs_inode *ni, struct page *page)
100{
101 u8 *kp = page_address(page);
102 unsigned int kp_ofs;
103
104 ntfs_debug("Zeroing page region outside initialized size.");
105 if (((s64)page->index << PAGE_CACHE_SHIFT) >= ni->initialized_size) {
106 /*
107 * FIXME: Using clear_page() will become wrong when we get
108 * PAGE_CACHE_SIZE != PAGE_SIZE but for now there is no problem.
109 */
110 clear_page(kp);
111 return;
112 }
113 kp_ofs = ni->initialized_size & ~PAGE_CACHE_MASK;
114 memset(kp + kp_ofs, 0, PAGE_CACHE_SIZE - kp_ofs);
115 return;
116}
117
118/**
119 * handle_bounds_compressed_page - test for&handle out of bounds compressed page
120 */
121static inline void handle_bounds_compressed_page(ntfs_inode *ni,
122 struct page *page)
123{
124 if ((page->index >= (ni->initialized_size >> PAGE_CACHE_SHIFT)) &&
125 (ni->initialized_size < VFS_I(ni)->i_size))
126 zero_partial_compressed_page(ni, page);
127 return;
128}
129
130/**
131 * ntfs_decompress - decompress a compression block into an array of pages
132 * @dest_pages: destination array of pages
133 * @dest_index: current index into @dest_pages (IN/OUT)
134 * @dest_ofs: current offset within @dest_pages[@dest_index] (IN/OUT)
135 * @dest_max_index: maximum index into @dest_pages (IN)
136 * @dest_max_ofs: maximum offset within @dest_pages[@dest_max_index] (IN)
137 * @xpage: the target page (-1 if none) (IN)
138 * @xpage_done: set to 1 if xpage was completed successfully (IN/OUT)
139 * @cb_start: compression block to decompress (IN)
140 * @cb_size: size of compression block @cb_start in bytes (IN)
141 *
142 * The caller must hold the ntfs_cb_lock spinlock. ntfs_decompress() drops it
143 * as soon as it has finished accessing the compression block, on both the
 * success and the error path.
144 *
145 * This decompresses the compression block @cb_start into the array of
146 * destination pages @dest_pages starting at index @dest_index into @dest_pages
147 * and at offset @dest_ofs into the page @dest_pages[@dest_index].
 *
 * Entries of @dest_pages may be NULL, in which case the sub-blocks destined
 * for that page are skipped without being decompressed.
148 *
149 * When the page @dest_pages[@xpage] is completed, @xpage_done is set to 1.
150 * If xpage is -1 or @xpage has not been completed, @xpage_done is not modified.
 *
 * Every page that was filled completely is finalized before returning: it is
 * flushed, unmapped, marked uptodate, unlocked, released (unless it is
 * @xpage) and its slot in @dest_pages is set to NULL. This also happens for
 * the pages completed before an error was detected.
151 *
152 * @cb_start is a pointer to the compression block which needs decompressing
153 * and @cb_size is the size of @cb_start in bytes (8-64kiB).
154 *
155 * Return 0 if success or -EOVERFLOW on error in the compressed stream.
156 * @xpage_done indicates whether the target page (@dest_pages[@xpage]) was
157 * completed during the decompression of the compression block (@cb_start).
158 *
159 * Warning: This function *REQUIRES* PAGE_CACHE_SIZE >= 4096 or it will blow up
160 * unpredictably! You have been warned!
161 *
162 * Note to hackers: This function may not sleep until it has finished accessing
163 * the compression block @cb_start as the caller passes in the shared
 * ntfs_compression_buffer while holding the ntfs_cb_lock spinlock.
164 */
165static int ntfs_decompress(struct page *dest_pages[], int *dest_index,
166 int *dest_ofs, const int dest_max_index, const int dest_max_ofs,
167 const int xpage, char *xpage_done, u8 *const cb_start,
168 const u32 cb_size)
169{
170 /*
171 * Pointers into the compressed data, i.e. the compression block (cb),
172 * and the therein contained sub-blocks (sb).
173 */
174 u8 *cb_end = cb_start + cb_size; /* End of cb. */
175 u8 *cb = cb_start; /* Current position in cb. */
176 u8 *cb_sb_start = cb; /* Beginning of the current sb in the cb. */
177 u8 *cb_sb_end; /* End of current sb / beginning of next sb. */
178
179 /* Variables for uncompressed data / destination. */
180 struct page *dp; /* Current destination page being worked on. */
181 u8 *dp_addr; /* Current pointer into dp. */
182 u8 *dp_sb_start; /* Start of current sub-block in dp. */
183 u8 *dp_sb_end; /* End of current sb in dp (dp_sb_start +
184 NTFS_SB_SIZE). */
185 u16 do_sb_start; /* @dest_ofs when starting this sub-block. */
186 u16 do_sb_end; /* @dest_ofs of end of this sb (do_sb_start +
187 NTFS_SB_SIZE). */
188
189 /* Variables for tag and token parsing. */
190 u8 tag; /* Current tag. */
191 int token; /* Loop counter for the eight tokens in tag. */
192
193 /* Need this because we can't sleep, so need two stages. */
 /*
  * One slot per page index in [*dest_index, dest_max_index]; the indices of
  * pages filled completely are recorded here while the lock is held and the
  * pages are finalized after it has been dropped.
  */
194 int completed_pages[dest_max_index - *dest_index + 1];
195 int nr_completed_pages = 0;
196
197 /* Default error code. */
198 int err = -EOVERFLOW;
199
200 ntfs_debug("Entering, cb_size = 0x%x.", cb_size);
201do_next_sb:
202 ntfs_debug("Beginning sub-block at offset = 0x%zx in the cb.",
203 cb - cb_start);
204 /*
205 * Have we reached the end of the compression block or the end of the
206 * decompressed data? The latter can happen for example if the current
207 * position in the compression block is one byte before its end so the
208 * first two checks do not detect it.
 *
 * NOTE(review): when cb == cb_end - 1 the 16-bit load below reads the
 * byte at cb_end, i.e. one byte past the range described by @cb_size.
 * Whether that byte is always backed by memory depends on the caller's
 * buffer - TODO confirm.
209 */
210 if (cb == cb_end || !le16_to_cpup((le16*)cb) ||
211 (*dest_index == dest_max_index &&
212 *dest_ofs == dest_max_ofs)) {
213 int i;
214
215 ntfs_debug("Completed. Returning success (0).");
216 err = 0;
 /*
  * Common exit: reached by falling through on success (err == 0) and by
  * jumping here from return_overflow with err still at its default of
  * -EOVERFLOW.
  */
217return_error:
218 /* We can sleep from now on, so we drop lock. */
219 spin_unlock(&ntfs_cb_lock);
220 /* Second stage: finalize completed pages. */
221 if (nr_completed_pages > 0) {
222 struct page *page = dest_pages[completed_pages[0]];
223 ntfs_inode *ni = NTFS_I(page->mapping->host);
224
225 for (i = 0; i < nr_completed_pages; i++) {
226 int di = completed_pages[i];
227
228 dp = dest_pages[di];
229 /*
230 * If we are outside the initialized size, zero
231 * the out of bounds page range.
232 */
233 handle_bounds_compressed_page(ni, dp);
234 flush_dcache_page(dp);
235 kunmap(dp);
236 SetPageUptodate(dp);
237 unlock_page(dp);
 /* The target page is kept for the caller; all others are released. */
238 if (di == xpage)
239 *xpage_done = 1;
240 else
241 page_cache_release(dp);
242 dest_pages[di] = NULL;
243 }
244 }
245 return err;
246 }
247
248 /* Setup offsets for the current sub-block destination. */
249 do_sb_start = *dest_ofs;
250 do_sb_end = do_sb_start + NTFS_SB_SIZE;
251
252 /* Check that we are still within allowed boundaries. */
253 if (*dest_index == dest_max_index && do_sb_end > dest_max_ofs)
254 goto return_overflow;
255
256 /* Does the minimum size of a compressed sb overflow valid range? */
257 if (cb + 6 > cb_end)
258 goto return_overflow;
259
260 /* Setup the current sub-block source pointers and validate range. */
 /*
  * The low 12 bits of the 16-bit sb header, plus 3, give the total size of
  * the sb in the cb, header included.
  */
261 cb_sb_start = cb;
262 cb_sb_end = cb_sb_start + (le16_to_cpup((le16*)cb) & NTFS_SB_SIZE_MASK)
263 + 3;
264 if (cb_sb_end > cb_end)
265 goto return_overflow;
266
267 /* Get the current destination page. */
268 dp = dest_pages[*dest_index];
269 if (!dp) {
270 /* No page present. Skip decompression of this sub-block. */
271 cb = cb_sb_end;
272
273 /* Advance destination position to next sub-block. */
274 *dest_ofs = (*dest_ofs + NTFS_SB_SIZE) & ~PAGE_CACHE_MASK;
275 if (!*dest_ofs && (++*dest_index > dest_max_index))
276 goto return_overflow;
277 goto do_next_sb;
278 }
279
280 /* We have a valid destination page. Setup the destination pointers. */
281 dp_addr = (u8*)page_address(dp) + do_sb_start;
282
283 /* Now, we are ready to process the current sub-block (sb). */
284 if (!(le16_to_cpup((le16*)cb) & NTFS_SB_IS_COMPRESSED)) {
285 ntfs_debug("Found uncompressed sub-block.");
286 /* This sb is not compressed, just copy it into destination. */
287
288 /* Advance source position to first data byte. */
289 cb += 2;
290
291 /* An uncompressed sb must be full size. */
292 if (cb_sb_end - cb != NTFS_SB_SIZE)
293 goto return_overflow;
294
295 /* Copy the block and advance the source position. */
296 memcpy(dp_addr, cb, NTFS_SB_SIZE);
297 cb += NTFS_SB_SIZE;
298
299 /* Advance destination position to next sub-block. */
300 *dest_ofs += NTFS_SB_SIZE;
301 if (!(*dest_ofs &= ~PAGE_CACHE_MASK)) {
 /* Also entered by goto from the compressed sb path below. */
302finalize_page:
303 /*
304 * First stage: add current page index to array of
305 * completed pages.
306 */
307 completed_pages[nr_completed_pages++] = *dest_index;
308 if (++*dest_index > dest_max_index)
309 goto return_overflow;
310 }
311 goto do_next_sb;
312 }
313 ntfs_debug("Found compressed sub-block.");
314 /* This sb is compressed, decompress it into destination. */
315
316 /* Setup destination pointers. */
317 dp_sb_start = dp_addr;
318 dp_sb_end = dp_sb_start + NTFS_SB_SIZE;
319
320 /* Forward to the first tag in the sub-block. */
321 cb += 2;
322do_next_tag:
323 if (cb == cb_sb_end) {
324 /* Check if the decompressed sub-block was not full-length. */
325 if (dp_addr < dp_sb_end) {
326 int nr_bytes = do_sb_end - *dest_ofs;
327
328 ntfs_debug("Filling incomplete sub-block with "
329 "zeroes.");
330 /* Zero remainder and update destination position. */
331 memset(dp_addr, 0, nr_bytes);
332 *dest_ofs += nr_bytes;
333 }
334 /* We have finished the current sub-block. */
335 if (!(*dest_ofs &= ~PAGE_CACHE_MASK))
336 goto finalize_page;
337 goto do_next_sb;
338 }
339
340 /* Check we are still in range. */
341 if (cb > cb_sb_end || dp_addr > dp_sb_end)
342 goto return_overflow;
343
344 /* Get the next tag and advance to first token. */
345 tag = *cb++;
346
347 /* Parse the eight tokens described by the tag. */
 /* Bit 0 of tag describes the current token; tag is shifted per token. */
348 for (token = 0; token < 8; token++, tag >>= 1) {
349 u16 lg, pt, length, max_non_overlap;
350 register u16 i;
351 u8 *dp_back_addr;
352
353 /* Check if we are done / still in range. */
354 if (cb >= cb_sb_end || dp_addr > dp_sb_end)
355 break;
356
357 /* Determine token type and parse appropriately.*/
358 if ((tag & NTFS_TOKEN_MASK) == NTFS_SYMBOL_TOKEN) {
359 /*
360 * We have a symbol token, copy the symbol across, and
361 * advance the source and destination positions.
362 */
363 *dp_addr++ = *cb++;
364 ++*dest_ofs;
365
366 /* Continue with the next token. */
367 continue;
368 }
369
370 /*
371 * We have a phrase token. Make sure it is not the first tag in
372 * the sb as this is illegal and would confuse the code below.
373 */
374 if (dp_addr == dp_sb_start)
375 goto return_overflow;
376
377 /*
378 * Determine the number of bytes to go back (p) and the number
379 * of bytes to copy (l). We use an optimized algorithm in which
380 * we first calculate log2(current destination position in sb),
381 * which allows determination of l and p in O(1) rather than
382 * O(n). We just need an arch-optimized log2() function now.
383 */
384 lg = 0;
385 for (i = *dest_ofs - do_sb_start - 1; i >= 0x10; i >>= 1)
386 lg++;
387
388 /* Get the phrase token into pt. */
 /*
  * NOTE(review): only cb < cb_sb_end has been checked above, so when
  * cb == cb_sb_end - 1 this 16-bit load reads one byte beyond the
  * current sb. The resulting cb > cb_sb_end is caught at the range
  * check under do_next_tag, but only after the load - TODO confirm
  * this is harmless when cb_sb_end == cb_end.
  */
389 pt = le16_to_cpup((le16*)cb);
390
391 /*
392 * Calculate starting position of the byte sequence in
393 * the destination using the fact that p = (pt >> (12 - lg)) + 1
394 * and make sure we don't go too far back.
395 */
396 dp_back_addr = dp_addr - (pt >> (12 - lg)) - 1;
397 if (dp_back_addr < dp_sb_start)
398 goto return_overflow;
399
400 /* Now calculate the length of the byte sequence. */
401 length = (pt & (0xfff >> lg)) + 3;
402
403 /* Advance destination position and verify it is in range. */
404 *dest_ofs += length;
405 if (*dest_ofs > do_sb_end)
406 goto return_overflow;
407
408 /* The number of non-overlapping bytes. */
409 max_non_overlap = dp_addr - dp_back_addr;
410
411 if (length <= max_non_overlap) {
412 /* The byte sequence doesn't overlap, just copy it. */
413 memcpy(dp_addr, dp_back_addr, length);
414
415 /* Advance destination pointer. */
416 dp_addr += length;
417 } else {
418 /*
419 * The byte sequence does overlap, copy non-overlapping
420 * part and then do a slow byte by byte copy for the
421 * overlapping part. Also, advance the destination
422 * pointer.
423 */
424 memcpy(dp_addr, dp_back_addr, max_non_overlap);
425 dp_addr += max_non_overlap;
426 dp_back_addr += max_non_overlap;
427 length -= max_non_overlap;
428 while (length--)
429 *dp_addr++ = *dp_back_addr++;
430 }
431
432 /* Advance source position and continue with the next token. */
433 cb += 2;
434 }
435
436 /* No tokens left in the current tag. Continue with the next tag. */
437 goto do_next_tag;
438
 /*
  * Error exit: err still holds its default of -EOVERFLOW, so just log and
  * share the unlock/finalize/return code above.
  */
439return_overflow:
440 ntfs_error(NULL, "Failed. Returning -EOVERFLOW.");
441 goto return_error;
442}
443
444/**
445 * ntfs_read_compressed_block - read a compressed block into the page cache
446 * @page: locked page in the compression block(s) we need to read
447 *
448 * When we are called the page has already been verified to be locked and the
449 * attribute is known to be non-resident, not encrypted, but compressed.
450 *
451 * 1. Determine which compression block(s) @page is in.
452 * 2. Get hold of all pages corresponding to this/these compression block(s).
453 * 3. Read the (first) compression block.
454 * 4. Decompress it into the corresponding pages.
455 * 5. Throw the compressed data away and proceed to 3. for the next compression
456 * block or return success if no more compression blocks left.
457 *
458 * Warning: We have to be careful what we do about existing pages. They might
459 * have been written to so that we would lose data if we were to just overwrite
460 * them with the out-of-date uncompressed data.
461 *
462 * FIXME: For PAGE_CACHE_SIZE > cb_size we are not doing the Right Thing(TM) at
463 * the end of the file I think. We need to detect this case and zero the out
464 * of bounds remainder of the page in question and mark it as handled. At the
465 * moment we would just return -EIO on such a page. This bug will only become
466 * apparent if pages are above 8kiB and the NTFS volume only uses 512 byte
467 * clusters so is probably not going to be seen by anyone. Still this should
468 * be fixed. (AIA)
469 *
470 * FIXME: Again for PAGE_CACHE_SIZE > cb_size we are screwing up both in
471 * handling sparse and compressed cbs. (AIA)
472 *
473 * FIXME: At the moment we don't do any zeroing out in the case that
474 * initialized_size is less than data_size. This should be safe because of the
475 * nature of the compression algorithm used. Just in case we check and output
476 * an error message in read inode if the two sizes are not equal for a
477 * compressed file. (AIA)
478 */
479int ntfs_read_compressed_block(struct page *page)
480{
481 struct address_space *mapping = page->mapping;
482 ntfs_inode *ni = NTFS_I(mapping->host);
483 ntfs_volume *vol = ni->vol;
484 struct super_block *sb = vol->sb;
485 runlist_element *rl;
486 unsigned long block_size = sb->s_blocksize;
487 unsigned char block_size_bits = sb->s_blocksize_bits;
488 u8 *cb, *cb_pos, *cb_end;
489 struct buffer_head **bhs;
490 unsigned long offset, index = page->index;
491 u32 cb_size = ni->itype.compressed.block_size;
492 u64 cb_size_mask = cb_size - 1UL;
493 VCN vcn;
494 LCN lcn;
495 /* The first wanted vcn (minimum alignment is PAGE_CACHE_SIZE). */
496 VCN start_vcn = (((s64)index << PAGE_CACHE_SHIFT) & ~cb_size_mask) >>
497 vol->cluster_size_bits;
498 /*
499 * The first vcn after the last wanted vcn (minumum alignment is again
500 * PAGE_CACHE_SIZE.
501 */
502 VCN end_vcn = ((((s64)(index + 1UL) << PAGE_CACHE_SHIFT) + cb_size - 1)
503 & ~cb_size_mask) >> vol->cluster_size_bits;
504 /* Number of compression blocks (cbs) in the wanted vcn range. */
505 unsigned int nr_cbs = (end_vcn - start_vcn) << vol->cluster_size_bits
506 >> ni->itype.compressed.block_size_bits;
507 /*
508 * Number of pages required to store the uncompressed data from all
509 * compression blocks (cbs) overlapping @page. Due to alignment
510 * guarantees of start_vcn and end_vcn, no need to round up here.
511 */
512 unsigned int nr_pages = (end_vcn - start_vcn) <<
513 vol->cluster_size_bits >> PAGE_CACHE_SHIFT;
514 unsigned int xpage, max_page, cur_page, cur_ofs, i;
515 unsigned int cb_clusters, cb_max_ofs;
516 int block, max_block, cb_max_page, bhs_size, nr_bhs, err = 0;
517 struct page **pages;
518 unsigned char xpage_done = 0;
519
520 ntfs_debug("Entering, page->index = 0x%lx, cb_size = 0x%x, nr_pages = "
521 "%i.", index, cb_size, nr_pages);
522 /*
523 * Bad things happen if we get here for anything that is not an
524 * unnamed $DATA attribute.
525 */
526 BUG_ON(ni->type != AT_DATA);
527 BUG_ON(ni->name_len);
528
529 pages = kmalloc(nr_pages * sizeof(struct page *), GFP_NOFS);
530
531 /* Allocate memory to store the buffer heads we need. */
532 bhs_size = cb_size / block_size * sizeof(struct buffer_head *);
533 bhs = kmalloc(bhs_size, GFP_NOFS);
534
535 if (unlikely(!pages || !bhs)) {
536 kfree(bhs);
537 kfree(pages);
538 SetPageError(page);
539 unlock_page(page);
540 ntfs_error(vol->sb, "Failed to allocate internal buffers.");
541 return -ENOMEM;
542 }
543
544 /*
545 * We have already been given one page, this is the one we must do.
546 * Once again, the alignment guarantees keep it simple.
547 */
548 offset = start_vcn << vol->cluster_size_bits >> PAGE_CACHE_SHIFT;
549 xpage = index - offset;
550 pages[xpage] = page;
551 /*
552 * The remaining pages need to be allocated and inserted into the page
553 * cache, alignment guarantees keep all the below much simpler. (-8
554 */
555 max_page = ((VFS_I(ni)->i_size + PAGE_CACHE_SIZE - 1) >>
556 PAGE_CACHE_SHIFT) - offset;
557 if (nr_pages < max_page)
558 max_page = nr_pages;
559 for (i = 0; i < max_page; i++, offset++) {
560 if (i != xpage)
561 pages[i] = grab_cache_page_nowait(mapping, offset);
562 page = pages[i];
563 if (page) {
564 /*
565 * We only (re)read the page if it isn't already read
566 * in and/or dirty or we would be losing data or at
567 * least wasting our time.
568 */
569 if (!PageDirty(page) && (!PageUptodate(page) ||
570 PageError(page))) {
571 ClearPageError(page);
572 kmap(page);
573 continue;
574 }
575 unlock_page(page);
576 page_cache_release(page);
577 pages[i] = NULL;
578 }
579 }
580
581 /*
582 * We have the runlist, and all the destination pages we need to fill.
583 * Now read the first compression block.
584 */
585 cur_page = 0;
586 cur_ofs = 0;
587 cb_clusters = ni->itype.compressed.block_clusters;
588do_next_cb:
589 nr_cbs--;
590 nr_bhs = 0;
591
592 /* Read all cb buffer heads one cluster at a time. */
593 rl = NULL;
594 for (vcn = start_vcn, start_vcn += cb_clusters; vcn < start_vcn;
595 vcn++) {
596 BOOL is_retry = FALSE;
597
598 if (!rl) {
599lock_retry_remap:
600 down_read(&ni->runlist.lock);
601 rl = ni->runlist.rl;
602 }
603 if (likely(rl != NULL)) {
604 /* Seek to element containing target vcn. */
605 while (rl->length && rl[1].vcn <= vcn)
606 rl++;
607 lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
608 } else
609 lcn = LCN_RL_NOT_MAPPED;
610 ntfs_debug("Reading vcn = 0x%llx, lcn = 0x%llx.",
611 (unsigned long long)vcn,
612 (unsigned long long)lcn);
613 if (lcn < 0) {
614 /*
615 * When we reach the first sparse cluster we have
616 * finished with the cb.
617 */
618 if (lcn == LCN_HOLE)
619 break;
620 if (is_retry || lcn != LCN_RL_NOT_MAPPED)
621 goto rl_err;
622 is_retry = TRUE;
623 /*
624 * Attempt to map runlist, dropping lock for the
625 * duration.
626 */
627 up_read(&ni->runlist.lock);
628 if (!ntfs_map_runlist(ni, vcn))
629 goto lock_retry_remap;
630 goto map_rl_err;
631 }
632 block = lcn << vol->cluster_size_bits >> block_size_bits;
633 /* Read the lcn from device in chunks of block_size bytes. */
634 max_block = block + (vol->cluster_size >> block_size_bits);
635 do {
636 ntfs_debug("block = 0x%x.", block);
637 if (unlikely(!(bhs[nr_bhs] = sb_getblk(sb, block))))
638 goto getblk_err;
639 nr_bhs++;
640 } while (++block < max_block);
641 }
642
643 /* Release the lock if we took it. */
644 if (rl)
645 up_read(&ni->runlist.lock);
646
647 /* Setup and initiate io on all buffer heads. */
648 for (i = 0; i < nr_bhs; i++) {
649 struct buffer_head *tbh = bhs[i];
650
651 if (unlikely(test_set_buffer_locked(tbh)))
652 continue;
653 if (unlikely(buffer_uptodate(tbh))) {
654 unlock_buffer(tbh);
655 continue;
656 }
657 get_bh(tbh);
658 tbh->b_end_io = end_buffer_read_sync;
659 submit_bh(READ, tbh);
660 }
661
662 /* Wait for io completion on all buffer heads. */
663 for (i = 0; i < nr_bhs; i++) {
664 struct buffer_head *tbh = bhs[i];
665
666 if (buffer_uptodate(tbh))
667 continue;
668 wait_on_buffer(tbh);
669 /*
670 * We need an optimization barrier here, otherwise we start
671 * hitting the below fixup code when accessing a loopback
672 * mounted ntfs partition. This indicates either there is a
673 * race condition in the loop driver or, more likely, gcc
674 * overoptimises the code without the barrier and it doesn't
675 * do the Right Thing(TM).
676 */
677 barrier();
678 if (unlikely(!buffer_uptodate(tbh))) {
679 ntfs_warning(vol->sb, "Buffer is unlocked but not "
680 "uptodate! Unplugging the disk queue "
681 "and rescheduling.");
682 get_bh(tbh);
683 blk_run_address_space(mapping);
684 schedule();
685 put_bh(tbh);
686 if (unlikely(!buffer_uptodate(tbh)))
687 goto read_err;
688 ntfs_warning(vol->sb, "Buffer is now uptodate. Good.");
689 }
690 }
691
692 /*
693 * Get the compression buffer. We must not sleep any more
694 * until we are finished with it.
695 */
696 spin_lock(&ntfs_cb_lock);
697 cb = ntfs_compression_buffer;
698
699 BUG_ON(!cb);
700
701 cb_pos = cb;
702 cb_end = cb + cb_size;
703
704 /* Copy the buffer heads into the contiguous buffer. */
705 for (i = 0; i < nr_bhs; i++) {
706 memcpy(cb_pos, bhs[i]->b_data, block_size);
707 cb_pos += block_size;
708 }
709
710 /* Just a precaution. */
711 if (cb_pos + 2 <= cb + cb_size)
712 *(u16*)cb_pos = 0;
713
714 /* Reset cb_pos back to the beginning. */
715 cb_pos = cb;
716
717 /* We now have both source (if present) and destination. */
718 ntfs_debug("Successfully read the compression block.");
719
720 /* The last page and maximum offset within it for the current cb. */
721 cb_max_page = (cur_page << PAGE_CACHE_SHIFT) + cur_ofs + cb_size;
722 cb_max_ofs = cb_max_page & ~PAGE_CACHE_MASK;
723 cb_max_page >>= PAGE_CACHE_SHIFT;
724
725 /* Catch end of file inside a compression block. */
726 if (cb_max_page > max_page)
727 cb_max_page = max_page;
728
729 if (vcn == start_vcn - cb_clusters) {
730 /* Sparse cb, zero out page range overlapping the cb. */
731 ntfs_debug("Found sparse compression block.");
732 /* We can sleep from now on, so we drop lock. */
733 spin_unlock(&ntfs_cb_lock);
734 if (cb_max_ofs)
735 cb_max_page--;
736 for (; cur_page < cb_max_page; cur_page++) {
737 page = pages[cur_page];
738 if (page) {
739 /*
740 * FIXME: Using clear_page() will become wrong
741 * when we get PAGE_CACHE_SIZE != PAGE_SIZE but
742 * for now there is no problem.
743 */
744 if (likely(!cur_ofs))
745 clear_page(page_address(page));
746 else
747 memset(page_address(page) + cur_ofs, 0,
748 PAGE_CACHE_SIZE -
749 cur_ofs);
750 flush_dcache_page(page);
751 kunmap(page);
752 SetPageUptodate(page);
753 unlock_page(page);
754 if (cur_page == xpage)
755 xpage_done = 1;
756 else
757 page_cache_release(page);
758 pages[cur_page] = NULL;
759 }
760 cb_pos += PAGE_CACHE_SIZE - cur_ofs;
761 cur_ofs = 0;
762 if (cb_pos >= cb_end)
763 break;
764 }
765 /* If we have a partial final page, deal with it now. */
766 if (cb_max_ofs && cb_pos < cb_end) {
767 page = pages[cur_page];
768 if (page)
769 memset(page_address(page) + cur_ofs, 0,
770 cb_max_ofs - cur_ofs);
771 /*
772 * No need to update cb_pos at this stage:
773 * cb_pos += cb_max_ofs - cur_ofs;
774 */
775 cur_ofs = cb_max_ofs;
776 }
777 } else if (vcn == start_vcn) {
778 /* We can't sleep so we need two stages. */
779 unsigned int cur2_page = cur_page;
780 unsigned int cur_ofs2 = cur_ofs;
781 u8 *cb_pos2 = cb_pos;
782
783 ntfs_debug("Found uncompressed compression block.");
784 /* Uncompressed cb, copy it to the destination pages. */
785 /*
786 * TODO: As a big optimization, we could detect this case
787 * before we read all the pages and use block_read_full_page()
788 * on all full pages instead (we still have to treat partial
789 * pages especially but at least we are getting rid of the
790 * synchronous io for the majority of pages.
791 * Or if we choose not to do the read-ahead/-behind stuff, we
792 * could just return block_read_full_page(pages[xpage]) as long
793 * as PAGE_CACHE_SIZE <= cb_size.
794 */
795 if (cb_max_ofs)
796 cb_max_page--;
797 /* First stage: copy data into destination pages. */
798 for (; cur_page < cb_max_page; cur_page++) {
799 page = pages[cur_page];
800 if (page)
801 memcpy(page_address(page) + cur_ofs, cb_pos,
802 PAGE_CACHE_SIZE - cur_ofs);
803 cb_pos += PAGE_CACHE_SIZE - cur_ofs;
804 cur_ofs = 0;
805 if (cb_pos >= cb_end)
806 break;
807 }
808 /* If we have a partial final page, deal with it now. */
809 if (cb_max_ofs && cb_pos < cb_end) {
810 page = pages[cur_page];
811 if (page)
812 memcpy(page_address(page) + cur_ofs, cb_pos,
813 cb_max_ofs - cur_ofs);
814 cb_pos += cb_max_ofs - cur_ofs;
815 cur_ofs = cb_max_ofs;
816 }
817 /* We can sleep from now on, so drop lock. */
818 spin_unlock(&ntfs_cb_lock);
819 /* Second stage: finalize pages. */
820 for (; cur2_page < cb_max_page; cur2_page++) {
821 page = pages[cur2_page];
822 if (page) {
823 /*
824 * If we are outside the initialized size, zero
825 * the out of bounds page range.
826 */
827 handle_bounds_compressed_page(ni, page);
828 flush_dcache_page(page);
829 kunmap(page);
830 SetPageUptodate(page);
831 unlock_page(page);
832 if (cur2_page == xpage)
833 xpage_done = 1;
834 else
835 page_cache_release(page);
836 pages[cur2_page] = NULL;
837 }
838 cb_pos2 += PAGE_CACHE_SIZE - cur_ofs2;
839 cur_ofs2 = 0;
840 if (cb_pos2 >= cb_end)
841 break;
842 }
843 } else {
844 /* Compressed cb, decompress it into the destination page(s). */
845 unsigned int prev_cur_page = cur_page;
846
847 ntfs_debug("Found compressed compression block.");
848 err = ntfs_decompress(pages, &cur_page, &cur_ofs,
849 cb_max_page, cb_max_ofs, xpage, &xpage_done,
850 cb_pos, cb_size - (cb_pos - cb));
851 /*
852 * We can sleep from now on, lock already dropped by
853 * ntfs_decompress().
854 */
855 if (err) {
856 ntfs_error(vol->sb, "ntfs_decompress() failed in inode "
857 "0x%lx with error code %i. Skipping "
858 "this compression block.",
859 ni->mft_no, -err);
860 /* Release the unfinished pages. */
861 for (; prev_cur_page < cur_page; prev_cur_page++) {
862 page = pages[prev_cur_page];
863 if (page) {
864 if (prev_cur_page == xpage &&
865 !xpage_done)
866 SetPageError(page);
867 flush_dcache_page(page);
868 kunmap(page);
869 unlock_page(page);
870 if (prev_cur_page != xpage)
871 page_cache_release(page);
872 pages[prev_cur_page] = NULL;
873 }
874 }
875 }
876 }
877
878 /* Release the buffer heads. */
879 for (i = 0; i < nr_bhs; i++)
880 brelse(bhs[i]);
881
882 /* Do we have more work to do? */
883 if (nr_cbs)
884 goto do_next_cb;
885
886 /* We no longer need the list of buffer heads. */
887 kfree(bhs);
888
889 /* Clean up if we have any pages left. Should never happen. */
890 for (cur_page = 0; cur_page < max_page; cur_page++) {
891 page = pages[cur_page];
892 if (page) {
893 ntfs_error(vol->sb, "Still have pages left! "
894 "Terminating them with extreme "
895 "prejudice. Inode 0x%lx, page index "
896 "0x%lx.", ni->mft_no, page->index);
897 if (cur_page == xpage && !xpage_done)
898 SetPageError(page);
899 flush_dcache_page(page);
900 kunmap(page);
901 unlock_page(page);
902 if (cur_page != xpage)
903 page_cache_release(page);
904 pages[cur_page] = NULL;
905 }
906 }
907
908 /* We no longer need the list of pages. */
909 kfree(pages);
910
911 /* If we have completed the requested page, we return success. */
912 if (likely(xpage_done))
913 return 0;
914
915 ntfs_debug("Failed. Returning error code %s.", err == -EOVERFLOW ?
916 "EOVERFLOW" : (!err ? "EIO" : "unkown error"));
917 return err < 0 ? err : -EIO;
918
919read_err:
920 ntfs_error(vol->sb, "IO error while reading compressed data.");
921 /* Release the buffer heads. */
922 for (i = 0; i < nr_bhs; i++)
923 brelse(bhs[i]);
924 goto err_out;
925
926map_rl_err:
927 ntfs_error(vol->sb, "ntfs_map_runlist() failed. Cannot read "
928 "compression block.");
929 goto err_out;
930
931rl_err:
932 up_read(&ni->runlist.lock);
933 ntfs_error(vol->sb, "ntfs_rl_vcn_to_lcn() failed. Cannot read "
934 "compression block.");
935 goto err_out;
936
937getblk_err:
938 up_read(&ni->runlist.lock);
939 ntfs_error(vol->sb, "getblk() failed. Cannot read compression block.");
940
941err_out:
942 kfree(bhs);
943 for (i = cur_page; i < max_page; i++) {
944 page = pages[i];
945 if (page) {
946 if (i == xpage && !xpage_done)
947 SetPageError(page);
948 flush_dcache_page(page);
949 kunmap(page);
950 unlock_page(page);
951 if (i != xpage)
952 page_cache_release(page);
953 }
954 }
955 kfree(pages);
956 return -EIO;
957}
diff --git a/fs/ntfs/debug.c b/fs/ntfs/debug.c
new file mode 100644
index 000000000000..6fb6bb5e3723
--- /dev/null
+++ b/fs/ntfs/debug.c
@@ -0,0 +1,180 @@
1/*
2 * debug.c - NTFS kernel debug support. Part of the Linux-NTFS project.
3 *
4 * Copyright (c) 2001-2004 Anton Altaparmakov
5 *
6 * This program/include file is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as published
8 * by the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program/include file is distributed in the hope that it will be
12 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
13 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program (in the main directory of the Linux-NTFS
18 * distribution in the file COPYING); if not, write to the Free Software
19 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21
22#include "debug.h"
23
24/*
25 * A static buffer to hold the error string being displayed and a spinlock
26 * to protect concurrent accesses to it.
 *
 * The buffer is shared by __ntfs_warning(), __ntfs_error() and __ntfs_debug().
 * Each of them formats its message into it with vsnprintf() bounded by
 * sizeof(err_buf) and prints it while still holding err_buf_lock.
 *
 * NOTE(review): the lock is taken with plain spin_lock(), i.e. interrupts are
 * not disabled; confirm none of the reporting functions can be reached from
 * interrupt context.
27 */
28static char err_buf[1024];
29static DEFINE_SPINLOCK(err_buf_lock);
30
31/**
32 * __ntfs_warning - output a warning to the syslog
33 * @function: name of function outputting the warning
34 * @sb: super block of mounted ntfs filesystem
35 * @fmt: warning string containing format specifications
36 * @...: a variable number of arguments specified in @fmt
37 *
38 * Outputs a warning to the syslog for the mounted ntfs filesystem described
39 * by @sb.
40 *
41 * @fmt and the corresponding @... is printf style format string containing
42 * the warning string and the corresponding format arguments, respectively.
43 *
44 * @function is the name of the function from which __ntfs_warning is being
45 * called.
46 *
47 * Note, you should be using debug.h::ntfs_warning(@sb, @fmt, @...) instead
48 * as this provides the @function parameter automatically.
49 */
50void __ntfs_warning(const char *function, const struct super_block *sb,
51 const char *fmt, ...)
52{
53 va_list args;
54 int flen = 0;
55
56#ifndef DEBUG
57 if (!printk_ratelimit())
58 return;
59#endif
60 if (function)
61 flen = strlen(function);
62 spin_lock(&err_buf_lock);
63 va_start(args, fmt);
64 vsnprintf(err_buf, sizeof(err_buf), fmt, args);
65 va_end(args);
66 if (sb)
67 printk(KERN_ERR "NTFS-fs warning (device %s): %s(): %s\n",
68 sb->s_id, flen ? function : "", err_buf);
69 else
70 printk(KERN_ERR "NTFS-fs warning: %s(): %s\n",
71 flen ? function : "", err_buf);
72 spin_unlock(&err_buf_lock);
73}
74
75/**
76 * __ntfs_error - output an error to the syslog
77 * @function: name of function outputting the error
78 * @sb: super block of mounted ntfs filesystem
79 * @fmt: error string containing format specifications
80 * @...: a variable number of arguments specified in @fmt
81 *
82 * Outputs an error to the syslog for the mounted ntfs filesystem described
83 * by @sb.
84 *
85 * @fmt and the corresponding @... is printf style format string containing
86 * the error string and the corresponding format arguments, respectively.
87 *
88 * @function is the name of the function from which __ntfs_error is being
89 * called.
90 *
91 * Note, you should be using debug.h::ntfs_error(@sb, @fmt, @...) instead
92 * as this provides the @function parameter automatically.
93 */
94void __ntfs_error(const char *function, const struct super_block *sb,
95 const char *fmt, ...)
96{
97 va_list args;
98 int flen = 0;
99
100#ifndef DEBUG
101 if (!printk_ratelimit())
102 return;
103#endif
104 if (function)
105 flen = strlen(function);
106 spin_lock(&err_buf_lock);
107 va_start(args, fmt);
108 vsnprintf(err_buf, sizeof(err_buf), fmt, args);
109 va_end(args);
110 if (sb)
111 printk(KERN_ERR "NTFS-fs error (device %s): %s(): %s\n",
112 sb->s_id, flen ? function : "", err_buf);
113 else
114 printk(KERN_ERR "NTFS-fs error: %s(): %s\n",
115 flen ? function : "", err_buf);
116 spin_unlock(&err_buf_lock);
117}
118
119#ifdef DEBUG
120
121/* If 1, output debug messages, and if 0, don't. */
/*
 * Tested by __ntfs_debug() and ntfs_debug_dump_runlist(), both of which return
 * without printing anything while this is zero.  Only exists when the driver
 * is compiled with DEBUG defined.
 */
122int debug_msgs = 0;
123
124void __ntfs_debug (const char *file, int line, const char *function,
125 const char *fmt, ...)
126{
127 va_list args;
128 int flen = 0;
129
130 if (!debug_msgs)
131 return;
132 if (function)
133 flen = strlen(function);
134 spin_lock(&err_buf_lock);
135 va_start(args, fmt);
136 vsnprintf(err_buf, sizeof(err_buf), fmt, args);
137 va_end(args);
138 printk(KERN_DEBUG "NTFS-fs DEBUG (%s, %d): %s(): %s\n", file, line,
139 flen ? function : "", err_buf);
140 spin_unlock(&err_buf_lock);
141}
142
143/* Dump a runlist. Caller has to provide synchronisation for @rl. */
144void ntfs_debug_dump_runlist(const runlist_element *rl)
145{
146 int i;
147 const char *lcn_str[5] = { "LCN_HOLE ", "LCN_RL_NOT_MAPPED",
148 "LCN_ENOENT ", "LCN_unknown " };
149
150 if (!debug_msgs)
151 return;
152 printk(KERN_DEBUG "NTFS-fs DEBUG: Dumping runlist (values in hex):\n");
153 if (!rl) {
154 printk(KERN_DEBUG "Run list not present.\n");
155 return;
156 }
157 printk(KERN_DEBUG "VCN LCN Run length\n");
158 for (i = 0; ; i++) {
159 LCN lcn = (rl + i)->lcn;
160
161 if (lcn < (LCN)0) {
162 int index = -lcn - 1;
163
164 if (index > -LCN_ENOENT - 1)
165 index = 3;
166 printk(KERN_DEBUG "%-16Lx %s %-16Lx%s\n",
167 (rl + i)->vcn, lcn_str[index],
168 (rl + i)->length, (rl + i)->length ?
169 "" : " (runlist end)");
170 } else
171 printk(KERN_DEBUG "%-16Lx %-16Lx %-16Lx%s\n",
172 (rl + i)->vcn, (rl + i)->lcn,
173 (rl + i)->length, (rl + i)->length ?
174 "" : " (runlist end)");
175 if (!(rl + i)->length)
176 break;
177 }
178}
179
180#endif
diff --git a/fs/ntfs/debug.h b/fs/ntfs/debug.h
new file mode 100644
index 000000000000..8ac37c33d127
--- /dev/null
+++ b/fs/ntfs/debug.h
@@ -0,0 +1,67 @@
1/*
2 * debug.h - NTFS kernel debug support. Part of the Linux-NTFS project.
3 *
4 * Copyright (c) 2001-2004 Anton Altaparmakov
5 *
6 * This program/include file is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as published
8 * by the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program/include file is distributed in the hope that it will be
12 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
13 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program (in the main directory of the Linux-NTFS
18 * distribution in the file COPYING); if not, write to the Free Software
19 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21
22#ifndef _LINUX_NTFS_DEBUG_H
23#define _LINUX_NTFS_DEBUG_H
24
25#include <linux/fs.h>
26
27#include "runlist.h"
28
29#ifdef DEBUG
30
/* Run time switch for the debug output; the variable is defined in debug.c. */
31extern int debug_msgs;
32
33#if 0 /* Fool kernel-doc since it doesn't do macros yet */
34/**
35 * ntfs_debug - write a debug level message to syslog
36 * @f: a printf format string containing the message
37 * @...: the variables to substitute into @f
38 *
39 * ntfs_debug() writes a DEBUG level message to the syslog but only if the
40 * driver was compiled with -DDEBUG. Otherwise, the call turns into a NOP.
41 */
42static void ntfs_debug(const char *f, ...);
43#endif
44
/*
 * Worker behind ntfs_debug().  The macro below fills in the source file, line
 * number and function name of the call site; the format attribute lets the
 * compiler check the arguments against the format string.
 */
45extern void __ntfs_debug (const char *file, int line, const char *function,
46 const char *format, ...) __attribute__ ((format (printf, 4, 5)));
47#define ntfs_debug(f, a...) \
48 __ntfs_debug(__FILE__, __LINE__, __FUNCTION__, f, ##a)
49
/* Dumps the runlist @rl to the syslog at debug level. */
50extern void ntfs_debug_dump_runlist(const runlist_element *rl);
51
52#else /* !DEBUG */
53
/*
 * Without DEBUG both calls expand to an empty statement, so their arguments
 * are neither evaluated nor checked by the compiler.
 */
54#define ntfs_debug(f, a...) do {} while (0)
55#define ntfs_debug_dump_runlist(rl) do {} while (0)
56
57#endif /* !DEBUG */
58
/*
 * ntfs_warning() is the interface to use; it passes the name of the calling
 * function (__FUNCTION__) to __ntfs_warning() as the first argument.
 */
59extern void __ntfs_warning(const char *function, const struct super_block *sb,
60 const char *fmt, ...) __attribute__ ((format (printf, 3, 4)));
61#define ntfs_warning(sb, f, a...) __ntfs_warning(__FUNCTION__, sb, f, ##a)
62
/*
 * ntfs_error() is the interface to use; it passes the name of the calling
 * function (__FUNCTION__) to __ntfs_error() as the first argument.
 */
63extern void __ntfs_error(const char *function, const struct super_block *sb,
64 const char *fmt, ...) __attribute__ ((format (printf, 3, 4)));
65#define ntfs_error(sb, f, a...) __ntfs_error(__FUNCTION__, sb, f, ##a)
66
67#endif /* _LINUX_NTFS_DEBUG_H */
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
new file mode 100644
index 000000000000..93577561cdbe
--- /dev/null
+++ b/fs/ntfs/dir.c
@@ -0,0 +1,1569 @@
1/**
2 * dir.c - NTFS kernel directory operations. Part of the Linux-NTFS project.
3 *
4 * Copyright (c) 2001-2004 Anton Altaparmakov
5 * Copyright (c) 2002 Richard Russon
6 *
7 * This program/include file is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as published
9 * by the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program/include file is distributed in the hope that it will be
13 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program (in the main directory of the Linux-NTFS
19 * distribution in the file COPYING); if not, write to the Free Software
20 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 */
22
23#include <linux/smp_lock.h>
24#include <linux/buffer_head.h>
25
26#include "dir.h"
27#include "aops.h"
28#include "attrib.h"
29#include "mft.h"
30#include "debug.h"
31#include "ntfs.h"
32
33/**
34 * The little endian Unicode string $I30 as a global constant.
 *
 * The array holds the four characters '$', 'I', '3', '0' followed by a
 * terminating zero.  The index root lookup in this file passes it to
 * ntfs_attr_lookup() with a length of 4, i.e. without the terminator.
35 */
36ntfschar I30[5] = { const_cpu_to_le16('$'), const_cpu_to_le16('I'),
37 const_cpu_to_le16('3'), const_cpu_to_le16('0'), 0 };
38
39/**
40 * ntfs_lookup_inode_by_name - find an inode in a directory given its name
41 * @dir_ni: ntfs inode of the directory in which to search for the name
42 * @uname: Unicode name for which to search in the directory
43 * @uname_len: length of the name @uname in Unicode characters
44 * @res: return the found file name if necessary (see below)
45 *
46 * Look for an inode with name @uname in the directory with inode @dir_ni.
47 * ntfs_lookup_inode_by_name() walks the contents of the directory looking for
48 * the Unicode name. If the name is found in the directory, the corresponding
49 * inode number (>= 0) is returned as a mft reference in cpu format, i.e. it
50 * is a 64-bit number containing the sequence number.
51 *
52 * On error, a negative value is returned corresponding to the error code. In
53 * particular if the inode is not found -ENOENT is returned. Note that you
54 * can't just check the return value for being negative, you have to check the
55 * inode number for being negative which you can extract using MREC(return
56 * value).
57 *
58 * Note, @uname_len does not include the (optional) terminating NUL character.
59 *
60 * Note, we look for a case sensitive match first but we also look for a case
61 * insensitive match at the same time. If we find a case insensitive match, we
62 * save that for the case that we don't find an exact match, where we return
63 * the case insensitive match and set up @res (which we allocate!) with the mft
64 * reference, the file name type, length and with a copy of the little endian
65 * Unicode file name itself. If we match a file name which is in the DOS name
66 * space, we only return the mft reference and file name type in @res.
67 * ntfs_lookup() then uses this to find the long file name in the inode itself.
68 * This is to avoid polluting the dcache with short file names. We want them to
69 * work but we don't care for how quickly one can access them. This also fixes
70 * the dcache aliasing issues.
71 *
72 * Locking: - Caller must hold i_sem on the directory.
73 * - Each page cache page in the index allocation mapping must be
74 * locked whilst being accessed otherwise we may find a corrupt
75 * page due to it being under ->writepage at the moment which
76 * applies the mst protection fixups before writing out and then
77 * removes them again after the write is complete after which it
78 * unlocks the page.
79 */
80MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname,
81 const int uname_len, ntfs_name **res)
82{
83 ntfs_volume *vol = dir_ni->vol;
84 struct super_block *sb = vol->sb;
85 MFT_RECORD *m;
86 INDEX_ROOT *ir;
87 INDEX_ENTRY *ie;
88 INDEX_ALLOCATION *ia;
89 u8 *index_end;
90 u64 mref;
91 ntfs_attr_search_ctx *ctx;
92 int err, rc;
93 VCN vcn, old_vcn;
94 struct address_space *ia_mapping;
95 struct page *page;
96 u8 *kaddr;
/*
 * The imperfect (case insensitive) or DOS name match remembered so far, if
 * any. Whenever it is allocated below, *res is pointed at it as well.
 */
97 ntfs_name *name = NULL;
98
99 BUG_ON(!S_ISDIR(VFS_I(dir_ni)->i_mode));
100 BUG_ON(NInoAttr(dir_ni));
101 /* Get hold of the mft record for the directory. */
102 m = map_mft_record(dir_ni);
103 if (IS_ERR(m)) {
104 ntfs_error(sb, "map_mft_record() failed with error code %ld.",
105 -PTR_ERR(m));
106 return ERR_MREF(PTR_ERR(m));
107 }
108 ctx = ntfs_attr_get_search_ctx(dir_ni, m);
109 if (unlikely(!ctx)) {
110 err = -ENOMEM;
111 goto err_out;
112 }
113 /* Find the index root attribute in the mft record. */
114 err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL,
115 0, ctx);
116 if (unlikely(err)) {
117 if (err == -ENOENT) {
118 ntfs_error(sb, "Index root attribute missing in "
119 "directory inode 0x%lx.",
120 dir_ni->mft_no);
121 err = -EIO;
122 }
123 goto err_out;
124 }
125 /* Get to the index root value (it's been verified in read_inode). */
126 ir = (INDEX_ROOT*)((u8*)ctx->attr +
127 le16_to_cpu(ctx->attr->data.resident.value_offset));
128 index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length);
129 /* The first index entry. */
130 ie = (INDEX_ENTRY*)((u8*)&ir->index +
131 le32_to_cpu(ir->index.entries_offset));
132 /*
133 * Loop until we exceed valid memory (corruption case) or until we
134 * reach the last entry.
135 */
/*
 * NOTE(review): the loop advances by ie->length and nothing in the bounds
 * checks below rejects a zero length. An entry with length zero which is not
 * the end entry and takes one of the "continue" paths would be visited again
 * and again - confirm the entry lengths are validated before we get here.
 */
136 for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
137 /* Bounds checks. */
138 if ((u8*)ie < (u8*)ctx->mrec || (u8*)ie +
139 sizeof(INDEX_ENTRY_HEADER) > index_end ||
140 (u8*)ie + le16_to_cpu(ie->key_length) >
141 index_end)
142 goto dir_err_out;
143 /*
144 * The last entry cannot contain a name. It can however contain
145 * a pointer to a child node in the B+tree so we just break out.
146 */
147 if (ie->flags & INDEX_ENTRY_END)
148 break;
149 /*
150 * We perform a case sensitive comparison and if that matches
151 * we are done and return the mft reference of the inode (i.e.
152 * the inode number together with the sequence number for
153 * consistency checking). We convert it to cpu format before
154 * returning.
155 */
156 if (ntfs_are_names_equal(uname, uname_len,
157 (ntfschar*)&ie->key.file_name.file_name,
158 ie->key.file_name.file_name_length,
159 CASE_SENSITIVE, vol->upcase, vol->upcase_len)) {
160found_it:
161 /*
162 * We have a perfect match, so we don't need to care
163 * about having matched imperfectly before, so we can
164 * free name and set *res to NULL.
165 * However, if the perfect match is a short file name,
166 * we need to signal this through *res, so that
167 * ntfs_lookup() can fix dcache aliasing issues.
168 * As an optimization we just reuse an existing
169 * allocation of *res.
170 */
171 if (ie->key.file_name.file_name_type == FILE_NAME_DOS) {
172 if (!name) {
173 name = kmalloc(sizeof(ntfs_name),
174 GFP_NOFS);
175 if (!name) {
176 err = -ENOMEM;
177 goto err_out;
178 }
179 }
180 name->mref = le64_to_cpu(
181 ie->data.dir.indexed_file);
182 name->type = FILE_NAME_DOS;
183 name->len = 0;
184 *res = name;
185 } else {
186 if (name)
187 kfree(name);
188 *res = NULL;
189 }
190 mref = le64_to_cpu(ie->data.dir.indexed_file);
191 ntfs_attr_put_search_ctx(ctx);
192 unmap_mft_record(dir_ni);
193 return mref;
194 }
195 /*
196 * For a case insensitive mount, we also perform a case
197 * insensitive comparison (provided the file name is not in the
198 * POSIX namespace). If the comparison matches, and the name is
199 * in the WIN32 namespace, we cache the filename in *res so
200 * that the caller, ntfs_lookup(), can work on it. If the
201 * comparison matches, and the name is in the DOS namespace, we
202 * only cache the mft reference and the file name type (we set
203 * the name length to zero for simplicity).
204 */
205 if (!NVolCaseSensitive(vol) &&
206 ie->key.file_name.file_name_type &&
207 ntfs_are_names_equal(uname, uname_len,
208 (ntfschar*)&ie->key.file_name.file_name,
209 ie->key.file_name.file_name_length,
210 IGNORE_CASE, vol->upcase, vol->upcase_len)) {
211 int name_size = sizeof(ntfs_name);
212 u8 type = ie->key.file_name.file_name_type;
213 u8 len = ie->key.file_name.file_name_length;
214
215 /* Only one case insensitive matching name allowed. */
216 if (name) {
217 ntfs_error(sb, "Found already allocated name "
218 "in phase 1. Please run chkdsk "
219 "and if that doesn't find any "
220 "errors please report you saw "
221 "this message to "
222 "linux-ntfs-dev@lists."
223 "sourceforge.net.");
224 goto dir_err_out;
225 }
226
227 if (type != FILE_NAME_DOS)
228 name_size += len * sizeof(ntfschar);
229 name = kmalloc(name_size, GFP_NOFS);
230 if (!name) {
231 err = -ENOMEM;
232 goto err_out;
233 }
234 name->mref = le64_to_cpu(ie->data.dir.indexed_file);
235 name->type = type;
236 if (type != FILE_NAME_DOS) {
237 name->len = len;
238 memcpy(name->name, ie->key.file_name.file_name,
239 len * sizeof(ntfschar));
240 } else
241 name->len = 0;
242 *res = name;
243 }
244 /*
245 * Not a perfect match, need to do full blown collation so we
246 * know which way in the B+tree we have to go.
247 */
248 rc = ntfs_collate_names(uname, uname_len,
249 (ntfschar*)&ie->key.file_name.file_name,
250 ie->key.file_name.file_name_length, 1,
251 IGNORE_CASE, vol->upcase, vol->upcase_len);
252 /*
253 * If uname collates before the name of the current entry, there
254 * is definitely no such name in this index but we might need to
255 * descend into the B+tree so we just break out of the loop.
256 */
257 if (rc == -1)
258 break;
259 /* The names are not equal, continue the search. */
260 if (rc)
261 continue;
262 /*
263 * Names match with case insensitive comparison, now try the
264 * case sensitive comparison, which is required for proper
265 * collation.
266 */
267 rc = ntfs_collate_names(uname, uname_len,
268 (ntfschar*)&ie->key.file_name.file_name,
269 ie->key.file_name.file_name_length, 1,
270 CASE_SENSITIVE, vol->upcase, vol->upcase_len);
271 if (rc == -1)
272 break;
273 if (rc)
274 continue;
275 /*
276 * Perfect match, this will never happen as the
277 * ntfs_are_names_equal() call will have gotten a match but we
278 * still treat it correctly.
279 */
280 goto found_it;
281 }
282 /*
283 * We have finished with this index without success. Check for the
284 * presence of a child node and if not present return -ENOENT, unless
285 * we have got a matching name cached in name in which case return the
286 * mft reference associated with it.
287 */
288 if (!(ie->flags & INDEX_ENTRY_NODE)) {
289 if (name) {
290 ntfs_attr_put_search_ctx(ctx);
291 unmap_mft_record(dir_ni);
/* *res was pointed at name when the match was cached above. */
292 return name->mref;
293 }
294 ntfs_debug("Entry not found.");
295 err = -ENOENT;
296 goto err_out;
297 } /* Child node present, descend into it. */
298 /* Consistency check: Verify that an index allocation exists. */
299 if (!NInoIndexAllocPresent(dir_ni)) {
300 ntfs_error(sb, "No index allocation attribute but index entry "
301 "requires one. Directory inode 0x%lx is "
302 "corrupt or driver bug.", dir_ni->mft_no);
/*
 * err is still zero here (the index root lookup succeeded), so err_out
 * turns it into -EIO.
 */
303 goto err_out;
304 }
305 /* Get the starting vcn of the index_block holding the child node. */
/*
 * The vcn is read from the last 8 bytes of the index entry.
 * NOTE(review): unlike the index buffer phase further down, the value is not
 * checked for being negative before it is used to map a page - confirm this
 * is intended.
 */
306 vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8));
307 ia_mapping = VFS_I(dir_ni)->i_mapping;
308 /*
309 * We are done with the index root and the mft record. Release them,
310 * otherwise we deadlock with ntfs_map_page().
311 */
312 ntfs_attr_put_search_ctx(ctx);
313 unmap_mft_record(dir_ni);
314 m = NULL;
315 ctx = NULL;
316descend_into_child_node:
317 /*
318 * Convert vcn to index into the index allocation attribute in units
319 * of PAGE_CACHE_SIZE and map the page cache page, reading it from
320 * disk if necessary.
321 */
322 page = ntfs_map_page(ia_mapping, vcn <<
323 dir_ni->itype.index.vcn_size_bits >> PAGE_CACHE_SHIFT);
324 if (IS_ERR(page)) {
325 ntfs_error(sb, "Failed to map directory index page, error %ld.",
326 -PTR_ERR(page));
327 err = PTR_ERR(page);
328 goto err_out;
329 }
330 lock_page(page);
331 kaddr = (u8*)page_address(page);
332fast_descend_into_child_node:
333 /* Get to the index allocation block. */
334 ia = (INDEX_ALLOCATION*)(kaddr + ((vcn <<
335 dir_ni->itype.index.vcn_size_bits) & ~PAGE_CACHE_MASK));
336 /* Bounds checks. */
337 if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_CACHE_SIZE) {
338 ntfs_error(sb, "Out of bounds check failed. Corrupt directory "
339 "inode 0x%lx or driver bug.", dir_ni->mft_no);
340 goto unm_err_out;
341 }
342 /* Catch multi sector transfer fixup errors. */
343 if (unlikely(!ntfs_is_indx_record(ia->magic))) {
344 ntfs_error(sb, "Directory index record with vcn 0x%llx is "
345 "corrupt. Corrupt inode 0x%lx. Run chkdsk.",
346 (unsigned long long)vcn, dir_ni->mft_no);
347 goto unm_err_out;
348 }
349 if (sle64_to_cpu(ia->index_block_vcn) != vcn) {
350 ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is "
351 "different from expected VCN (0x%llx). "
352 "Directory inode 0x%lx is corrupt or driver "
353 "bug.", (unsigned long long)
354 sle64_to_cpu(ia->index_block_vcn),
355 (unsigned long long)vcn, dir_ni->mft_no);
356 goto unm_err_out;
357 }
/*
 * NOTE(review): 0x18 is presumably the number of bytes in INDEX_ALLOCATION
 * preceding the index header - confirm against layout.h.
 */
358 if (le32_to_cpu(ia->index.allocated_size) + 0x18 !=
359 dir_ni->itype.index.block_size) {
360 ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
361 "0x%lx has a size (%u) differing from the "
362 "directory specified size (%u). Directory "
363 "inode is corrupt or driver bug.",
364 (unsigned long long)vcn, dir_ni->mft_no,
365 le32_to_cpu(ia->index.allocated_size) + 0x18,
366 dir_ni->itype.index.block_size);
367 goto unm_err_out;
368 }
369 index_end = (u8*)ia + dir_ni->itype.index.block_size;
370 if (index_end > kaddr + PAGE_CACHE_SIZE) {
371 ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
372 "0x%lx crosses page boundary. Impossible! "
373 "Cannot access! This is probably a bug in the "
374 "driver.", (unsigned long long)vcn,
375 dir_ni->mft_no);
376 goto unm_err_out;
377 }
378 index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length);
379 if (index_end > (u8*)ia + dir_ni->itype.index.block_size) {
380 ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of directory "
381 "inode 0x%lx exceeds maximum size.",
382 (unsigned long long)vcn, dir_ni->mft_no);
383 goto unm_err_out;
384 }
385 /* The first index entry. */
386 ie = (INDEX_ENTRY*)((u8*)&ia->index +
387 le32_to_cpu(ia->index.entries_offset));
388 /*
389 * Iterate similar to above big loop but applied to index buffer, thus
390 * loop until we exceed valid memory (corruption case) or until we
391 * reach the last entry.
392 */
393 for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
394 /* Bounds check. */
395 if ((u8*)ie < (u8*)ia || (u8*)ie +
396 sizeof(INDEX_ENTRY_HEADER) > index_end ||
397 (u8*)ie + le16_to_cpu(ie->key_length) >
398 index_end) {
399 ntfs_error(sb, "Index entry out of bounds in "
400 "directory inode 0x%lx.",
401 dir_ni->mft_no);
402 goto unm_err_out;
403 }
404 /*
405 * The last entry cannot contain a name. It can however contain
406 * a pointer to a child node in the B+tree so we just break out.
407 */
408 if (ie->flags & INDEX_ENTRY_END)
409 break;
410 /*
411 * We perform a case sensitive comparison and if that matches
412 * we are done and return the mft reference of the inode (i.e.
413 * the inode number together with the sequence number for
414 * consistency checking). We convert it to cpu format before
415 * returning.
416 */
417 if (ntfs_are_names_equal(uname, uname_len,
418 (ntfschar*)&ie->key.file_name.file_name,
419 ie->key.file_name.file_name_length,
420 CASE_SENSITIVE, vol->upcase, vol->upcase_len)) {
421found_it2:
422 /*
423 * We have a perfect match, so we don't need to care
424 * about having matched imperfectly before, so we can
425 * free name and set *res to NULL.
426 * However, if the perfect match is a short file name,
427 * we need to signal this through *res, so that
428 * ntfs_lookup() can fix dcache aliasing issues.
429 * As an optimization we just reuse an existing
430 * allocation of *res.
431 */
432 if (ie->key.file_name.file_name_type == FILE_NAME_DOS) {
433 if (!name) {
434 name = kmalloc(sizeof(ntfs_name),
435 GFP_NOFS);
436 if (!name) {
437 err = -ENOMEM;
438 goto unm_err_out;
439 }
440 }
441 name->mref = le64_to_cpu(
442 ie->data.dir.indexed_file);
443 name->type = FILE_NAME_DOS;
444 name->len = 0;
445 *res = name;
446 } else {
447 if (name)
448 kfree(name);
449 *res = NULL;
450 }
451 mref = le64_to_cpu(ie->data.dir.indexed_file);
452 unlock_page(page);
453 ntfs_unmap_page(page);
454 return mref;
455 }
456 /*
457 * For a case insensitive mount, we also perform a case
458 * insensitive comparison (provided the file name is not in the
459 * POSIX namespace). If the comparison matches, and the name is
460 * in the WIN32 namespace, we cache the filename in *res so
461 * that the caller, ntfs_lookup(), can work on it. If the
462 * comparison matches, and the name is in the DOS namespace, we
463 * only cache the mft reference and the file name type (we set
464 * the name length to zero for simplicity).
465 */
466 if (!NVolCaseSensitive(vol) &&
467 ie->key.file_name.file_name_type &&
468 ntfs_are_names_equal(uname, uname_len,
469 (ntfschar*)&ie->key.file_name.file_name,
470 ie->key.file_name.file_name_length,
471 IGNORE_CASE, vol->upcase, vol->upcase_len)) {
472 int name_size = sizeof(ntfs_name);
473 u8 type = ie->key.file_name.file_name_type;
474 u8 len = ie->key.file_name.file_name_length;
475
476 /* Only one case insensitive matching name allowed. */
477 if (name) {
478 ntfs_error(sb, "Found already allocated name "
479 "in phase 2. Please run chkdsk "
480 "and if that doesn't find any "
481 "errors please report you saw "
482 "this message to "
483 "linux-ntfs-dev@lists."
484 "sourceforge.net.");
/*
 * Release the page here: dir_err_out continues at err_out, which is past the
 * page release done at unm_err_out.
 */
485 unlock_page(page);
486 ntfs_unmap_page(page);
487 goto dir_err_out;
488 }
489
490 if (type != FILE_NAME_DOS)
491 name_size += len * sizeof(ntfschar);
492 name = kmalloc(name_size, GFP_NOFS);
493 if (!name) {
494 err = -ENOMEM;
495 goto unm_err_out;
496 }
497 name->mref = le64_to_cpu(ie->data.dir.indexed_file);
498 name->type = type;
499 if (type != FILE_NAME_DOS) {
500 name->len = len;
501 memcpy(name->name, ie->key.file_name.file_name,
502 len * sizeof(ntfschar));
503 } else
504 name->len = 0;
505 *res = name;
506 }
507 /*
508 * Not a perfect match, need to do full blown collation so we
509 * know which way in the B+tree we have to go.
510 */
511 rc = ntfs_collate_names(uname, uname_len,
512 (ntfschar*)&ie->key.file_name.file_name,
513 ie->key.file_name.file_name_length, 1,
514 IGNORE_CASE, vol->upcase, vol->upcase_len);
515 /*
516 * If uname collates before the name of the current entry, there
517 * is definitely no such name in this index but we might need to
518 * descend into the B+tree so we just break out of the loop.
519 */
520 if (rc == -1)
521 break;
522 /* The names are not equal, continue the search. */
523 if (rc)
524 continue;
525 /*
526 * Names match with case insensitive comparison, now try the
527 * case sensitive comparison, which is required for proper
528 * collation.
529 */
530 rc = ntfs_collate_names(uname, uname_len,
531 (ntfschar*)&ie->key.file_name.file_name,
532 ie->key.file_name.file_name_length, 1,
533 CASE_SENSITIVE, vol->upcase, vol->upcase_len);
534 if (rc == -1)
535 break;
536 if (rc)
537 continue;
538 /*
539 * Perfect match, this will never happen as the
540 * ntfs_are_names_equal() call will have gotten a match but we
541 * still treat it correctly.
542 */
543 goto found_it2;
544 }
545 /*
546 * We have finished with this index buffer without success. Check for
547 * the presence of a child node.
548 */
549 if (ie->flags & INDEX_ENTRY_NODE) {
550 if ((ia->index.flags & NODE_MASK) == LEAF_NODE) {
551 ntfs_error(sb, "Index entry with child node found in "
552 "a leaf node in directory inode 0x%lx.",
553 dir_ni->mft_no);
554 goto unm_err_out;
555 }
556 /* Child node present, descend into it. */
557 old_vcn = vcn;
558 vcn = sle64_to_cpup((sle64*)((u8*)ie +
559 le16_to_cpu(ie->length) - 8));
560 if (vcn >= 0) {
/*
 * NOTE(review): the same-page test below shifts by vol->cluster_size_bits
 * whereas the page index at descend_into_child_node is computed with
 * dir_ni->itype.index.vcn_size_bits - confirm the two agree whenever the
 * fast path is taken.
 */
561 /* If vcn is in the same page cache page as old_vcn we
562 * recycle the mapped page. */
563 if (old_vcn << vol->cluster_size_bits >>
564 PAGE_CACHE_SHIFT == vcn <<
565 vol->cluster_size_bits >>
566 PAGE_CACHE_SHIFT)
567 goto fast_descend_into_child_node;
568 unlock_page(page);
569 ntfs_unmap_page(page);
570 goto descend_into_child_node;
571 }
572 ntfs_error(sb, "Negative child node vcn in directory inode "
573 "0x%lx.", dir_ni->mft_no);
574 goto unm_err_out;
575 }
576 /*
577 * No child node present, return -ENOENT, unless we have got a matching
578 * name cached in name in which case return the mft reference
579 * associated with it.
580 */
581 if (name) {
582 unlock_page(page);
583 ntfs_unmap_page(page);
584 return name->mref;
585 }
586 ntfs_debug("Entry not found.");
587 err = -ENOENT;
/*
 * Error exits. unm_err_out is for failures while an index allocation page is
 * locked and mapped; it releases the page and falls through to err_out.
 */
588unm_err_out:
589 unlock_page(page);
590 ntfs_unmap_page(page);
591err_out:
/* Paths which jumped here without setting an error code report -EIO. */
592 if (!err)
593 err = -EIO;
/*
 * ctx and m are set to NULL once the index root has been released, so they
 * are only released here if the failure happened while they were still held.
 */
594 if (ctx)
595 ntfs_attr_put_search_ctx(ctx);
596 if (m)
597 unmap_mft_record(dir_ni);
/* Do not leave the caller with a pointer to the freed name. */
598 if (name) {
599 kfree(name);
600 *res = NULL;
601 }
602 return ERR_MREF(err);
603dir_err_out:
604 ntfs_error(sb, "Corrupt directory. Aborting lookup.");
605 goto err_out;
606}
607
608#if 0
609
610// TODO: (AIA)
611// The algorithm embedded in this code will be required for the time when we
612// want to support adding of entries to directories, where we require correct
613// collation of file names in order not to cause corruption of the file system.
614
615/**
616 * ntfs_lookup_inode_by_name - find an inode in a directory given its name
617 * @dir_ni: ntfs inode of the directory in which to search for the name
618 * @uname: Unicode name for which to search in the directory
619 * @uname_len: length of the name @uname in Unicode characters
620 *
621 * Look for an inode with name @uname in the directory with inode @dir_ni.
622 * ntfs_lookup_inode_by_name() walks the contents of the directory looking for
623 * the Unicode name. If the name is found in the directory, the corresponding
624 * inode number (>= 0) is returned as a mft reference in cpu format, i.e. it
625 * is a 64-bit number containing the sequence number.
626 *
627 * On error, a negative value is returned corresponding to the error code. In
628 * particular if the inode is not found -ENOENT is returned. Note that you
629 * can't just check the return value for being negative, you have to check the
630 * inode number for being negative which you can extract using MREC(return
631 * value).
632 *
633 * Note, @uname_len does not include the (optional) terminating NUL character.
 *
 * The index root in the mft record is searched first. If the terminating
 * entry there carries a child node pointer, the search continues in the index
 * allocation blocks, one page cache page at a time, with the page locked
 * while it is examined.
 *
 * NOTE(review): an "#endif" follows this function without a matching
 * conditional being visible after it starts, so the function appears to sit
 * inside a conditional block opened earlier in the file - confirm whether it
 * is compiled at all. The prototype in dir.h also takes a fourth argument
 * and returns MFT_REF, which does not match this definition.
634 */
635u64 ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname,
636 const int uname_len)
637{
638 ntfs_volume *vol = dir_ni->vol;
639 struct super_block *sb = vol->sb;
640 MFT_RECORD *m;
641 INDEX_ROOT *ir;
642 INDEX_ENTRY *ie;
643 INDEX_ALLOCATION *ia;
644 u8 *index_end;
645 u64 mref;
646 ntfs_attr_search_ctx *ctx;
647 int err, rc;
648 IGNORE_CASE_BOOL ic;
649 VCN vcn, old_vcn;
650 struct address_space *ia_mapping;
651 struct page *page;
652 u8 *kaddr;
653
654 /* Get hold of the mft record for the directory. */
655 m = map_mft_record(dir_ni);
656 if (IS_ERR(m)) {
657 ntfs_error(sb, "map_mft_record() failed with error code %ld.",
658 -PTR_ERR(m));
659 return ERR_MREF(PTR_ERR(m));
660 }
661 ctx = ntfs_attr_get_search_ctx(dir_ni, m);
662 if (!ctx) {
663 err = -ENOMEM;
664 goto err_out;
665 }
666 /* Find the index root attribute in the mft record. */
667 err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL,
668 0, ctx);
669 if (unlikely(err)) {
670 if (err == -ENOENT) {
671 ntfs_error(sb, "Index root attribute missing in "
672 "directory inode 0x%lx.",
673 dir_ni->mft_no);
674 err = -EIO;
675 }
676 goto err_out;
677 }
678 /* Get to the index root value (it's been verified in read_inode). */
679 ir = (INDEX_ROOT*)((u8*)ctx->attr +
680 le16_to_cpu(ctx->attr->data.resident.value_offset));
681 index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length);
682 /* The first index entry. */
683 ie = (INDEX_ENTRY*)((u8*)&ir->index +
684 le32_to_cpu(ir->index.entries_offset));
685 /*
686 * Loop until we exceed valid memory (corruption case) or until we
687 * reach the last entry.
688 */
689 for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
690 /* Bounds checks. */
691 if ((u8*)ie < (u8*)ctx->mrec || (u8*)ie +
692 sizeof(INDEX_ENTRY_HEADER) > index_end ||
693 (u8*)ie + le16_to_cpu(ie->key_length) >
694 index_end)
695 goto dir_err_out;
696 /*
697 * The last entry cannot contain a name. It can however contain
698 * a pointer to a child node in the B+tree so we just break out.
699 */
700 if (ie->flags & INDEX_ENTRY_END)
701 break;
702 /*
703 * If the current entry has a name type of POSIX, the name is
704 * case sensitive and not otherwise. This has the effect of us
705 * not being able to access any POSIX file names which collate
706 * after the non-POSIX one when they only differ in case, but
707 * anyone doing screwy stuff like that deserves to burn in
708 * hell... Doing that kind of stuff on NT4 actually causes
709 * corruption on the partition even when using SP6a and Linux
710 * is not involved at all.
711 */
712 ic = ie->key.file_name.file_name_type ? IGNORE_CASE :
713 CASE_SENSITIVE;
714 /*
715 * If the names match perfectly, we are done and return the
716 * mft reference of the inode (i.e. the inode number together
717 * with the sequence number for consistency checking). We
718 * convert it to cpu format before returning.
719 */
720 if (ntfs_are_names_equal(uname, uname_len,
721 (ntfschar*)&ie->key.file_name.file_name,
722 ie->key.file_name.file_name_length, ic,
723 vol->upcase, vol->upcase_len)) {
724found_it:
725 mref = le64_to_cpu(ie->data.dir.indexed_file);
726 ntfs_attr_put_search_ctx(ctx);
727 unmap_mft_record(dir_ni);
728 return mref;
729 }
730 /*
731 * Not a perfect match, need to do full blown collation so we
732 * know which way in the B+tree we have to go.
733 */
734 rc = ntfs_collate_names(uname, uname_len,
735 (ntfschar*)&ie->key.file_name.file_name,
736 ie->key.file_name.file_name_length, 1,
737 IGNORE_CASE, vol->upcase, vol->upcase_len);
738 /*
739 * If uname collates before the name of the current entry, there
740 * is definitely no such name in this index but we might need to
741 * descend into the B+tree so we just break out of the loop.
742 */
743 if (rc == -1)
744 break;
745 /* The names are not equal, continue the search. */
746 if (rc)
747 continue;
748 /*
749 * Names match with case insensitive comparison, now try the
750 * case sensitive comparison, which is required for proper
751 * collation.
752 */
753 rc = ntfs_collate_names(uname, uname_len,
754 (ntfschar*)&ie->key.file_name.file_name,
755 ie->key.file_name.file_name_length, 1,
756 CASE_SENSITIVE, vol->upcase, vol->upcase_len);
757 if (rc == -1)
758 break;
759 if (rc)
760 continue;
761 /*
762 * Perfect match, this will never happen as the
763 * ntfs_are_names_equal() call will have gotten a match but we
764 * still treat it correctly.
765 */
766 goto found_it;
767 }
768 /*
769 * We have finished with this index without success. Check for the
770 * presence of a child node.
771 */
772 if (!(ie->flags & INDEX_ENTRY_NODE)) {
773 /* No child node, return -ENOENT. */
774 err = -ENOENT;
775 goto err_out;
776 } /* Child node present, descend into it. */
777 /* Consistency check: Verify that an index allocation exists. */
 /* Note, err is still zero here so err_out turns this into -EIO. */
778 if (!NInoIndexAllocPresent(dir_ni)) {
779 ntfs_error(sb, "No index allocation attribute but index entry "
780 "requires one. Directory inode 0x%lx is "
781 "corrupt or driver bug.", dir_ni->mft_no);
782 goto err_out;
783 }
784 /* Get the starting vcn of the index_block holding the child node. */
 /* It is read from the last 8 bytes of the index entry. */
785 vcn = sle64_to_cpup((u8*)ie + le16_to_cpu(ie->length) - 8);
786 ia_mapping = VFS_I(dir_ni)->i_mapping;
787 /*
788 * We are done with the index root and the mft record. Release them,
789 * otherwise we deadlock with ntfs_map_page().
790 */
791 ntfs_attr_put_search_ctx(ctx);
792 unmap_mft_record(dir_ni);
 /* Clear both so that the error paths below do not release them again. */
793 m = NULL;
794 ctx = NULL;
795descend_into_child_node:
796 /*
797 * Convert vcn to index into the index allocation attribute in units
798 * of PAGE_CACHE_SIZE and map the page cache page, reading it from
799 * disk if necessary.
800 */
801 page = ntfs_map_page(ia_mapping, vcn <<
802 dir_ni->itype.index.vcn_size_bits >> PAGE_CACHE_SHIFT);
803 if (IS_ERR(page)) {
804 ntfs_error(sb, "Failed to map directory index page, error %ld.",
805 -PTR_ERR(page));
806 err = PTR_ERR(page);
807 goto err_out;
808 }
809 lock_page(page);
810 kaddr = (u8*)page_address(page);
811fast_descend_into_child_node:
812 /* Get to the index allocation block. */
813 ia = (INDEX_ALLOCATION*)(kaddr + ((vcn <<
814 dir_ni->itype.index.vcn_size_bits) & ~PAGE_CACHE_MASK));
815 /* Bounds checks. */
816 if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_CACHE_SIZE) {
817 ntfs_error(sb, "Out of bounds check failed. Corrupt directory "
818 "inode 0x%lx or driver bug.", dir_ni->mft_no);
819 goto unm_err_out;
820 }
821 /* Catch multi sector transfer fixup errors. */
822 if (unlikely(!ntfs_is_indx_record(ia->magic))) {
823 ntfs_error(sb, "Directory index record with vcn 0x%llx is "
824 "corrupt. Corrupt inode 0x%lx. Run chkdsk.",
825 (unsigned long long)vcn, dir_ni->mft_no);
826 goto unm_err_out;
827 }
828 if (sle64_to_cpu(ia->index_block_vcn) != vcn) {
829 ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is "
830 "different from expected VCN (0x%llx). "
831 "Directory inode 0x%lx is corrupt or driver "
832 "bug.", (unsigned long long)
833 sle64_to_cpu(ia->index_block_vcn),
834 (unsigned long long)vcn, dir_ni->mft_no);
835 goto unm_err_out;
836 }
837 if (le32_to_cpu(ia->index.allocated_size) + 0x18 !=
838 dir_ni->itype.index.block_size) {
839 ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
840 "0x%lx has a size (%u) differing from the "
841 "directory specified size (%u). Directory "
842 "inode is corrupt or driver bug.",
843 (unsigned long long)vcn, dir_ni->mft_no,
844 le32_to_cpu(ia->index.allocated_size) + 0x18,
845 dir_ni->itype.index.block_size);
846 goto unm_err_out;
847 }
848 index_end = (u8*)ia + dir_ni->itype.index.block_size;
849 if (index_end > kaddr + PAGE_CACHE_SIZE) {
850 ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
851 "0x%lx crosses page boundary. Impossible! "
852 "Cannot access! This is probably a bug in the "
853 "driver.", (unsigned long long)vcn,
854 dir_ni->mft_no);
855 goto unm_err_out;
856 }
857 index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length);
858 if (index_end > (u8*)ia + dir_ni->itype.index.block_size) {
859 ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of directory "
860 "inode 0x%lx exceeds maximum size.",
861 (unsigned long long)vcn, dir_ni->mft_no);
862 goto unm_err_out;
863 }
864 /* The first index entry. */
865 ie = (INDEX_ENTRY*)((u8*)&ia->index +
866 le32_to_cpu(ia->index.entries_offset));
867 /*
868 * Iterate similar to above big loop but applied to index buffer, thus
869 * loop until we exceed valid memory (corruption case) or until we
870 * reach the last entry.
871 */
872 for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
873 /* Bounds check. */
874 if ((u8*)ie < (u8*)ia || (u8*)ie +
875 sizeof(INDEX_ENTRY_HEADER) > index_end ||
876 (u8*)ie + le16_to_cpu(ie->key_length) >
877 index_end) {
878 ntfs_error(sb, "Index entry out of bounds in "
879 "directory inode 0x%lx.",
880 dir_ni->mft_no);
881 goto unm_err_out;
882 }
883 /*
884 * The last entry cannot contain a name. It can however contain
885 * a pointer to a child node in the B+tree so we just break out.
886 */
887 if (ie->flags & INDEX_ENTRY_END)
888 break;
889 /*
890 * If the current entry has a name type of POSIX, the name is
891 * case sensitive and not otherwise. This has the effect of us
892 * not being able to access any POSIX file names which collate
893 * after the non-POSIX one when they only differ in case, but
894 * anyone doing screwy stuff like that deserves to burn in
895 * hell... Doing that kind of stuff on NT4 actually causes
896 * corruption on the partition even when using SP6a and Linux
897 * is not involved at all.
898 */
899 ic = ie->key.file_name.file_name_type ? IGNORE_CASE :
900 CASE_SENSITIVE;
901 /*
902 * If the names match perfectly, we are done and return the
903 * mft reference of the inode (i.e. the inode number together
904 * with the sequence number for consistency checking). We
905 * convert it to cpu format before returning.
906 */
907 if (ntfs_are_names_equal(uname, uname_len,
908 (ntfschar*)&ie->key.file_name.file_name,
909 ie->key.file_name.file_name_length, ic,
910 vol->upcase, vol->upcase_len)) {
911found_it2:
912 mref = le64_to_cpu(ie->data.dir.indexed_file);
913 unlock_page(page);
914 ntfs_unmap_page(page);
915 return mref;
916 }
917 /*
918 * Not a perfect match, need to do full blown collation so we
919 * know which way in the B+tree we have to go.
920 */
921 rc = ntfs_collate_names(uname, uname_len,
922 (ntfschar*)&ie->key.file_name.file_name,
923 ie->key.file_name.file_name_length, 1,
924 IGNORE_CASE, vol->upcase, vol->upcase_len);
925 /*
926 * If uname collates before the name of the current entry, there
927 * is definitely no such name in this index but we might need to
928 * descend into the B+tree so we just break out of the loop.
929 */
930 if (rc == -1)
931 break;
932 /* The names are not equal, continue the search. */
933 if (rc)
934 continue;
935 /*
936 * Names match with case insensitive comparison, now try the
937 * case sensitive comparison, which is required for proper
938 * collation.
939 */
940 rc = ntfs_collate_names(uname, uname_len,
941 (ntfschar*)&ie->key.file_name.file_name,
942 ie->key.file_name.file_name_length, 1,
943 CASE_SENSITIVE, vol->upcase, vol->upcase_len);
944 if (rc == -1)
945 break;
946 if (rc)
947 continue;
948 /*
949 * Perfect match, this will never happen as the
950 * ntfs_are_names_equal() call will have gotten a match but we
951 * still treat it correctly.
952 */
953 goto found_it2;
954 }
955 /*
956 * We have finished with this index buffer without success. Check for
957 * the presence of a child node.
958 */
959 if (ie->flags & INDEX_ENTRY_NODE) {
960 if ((ia->index.flags & NODE_MASK) == LEAF_NODE) {
961 ntfs_error(sb, "Index entry with child node found in "
962 "a leaf node in directory inode 0x%lx.",
963 dir_ni->mft_no);
964 goto unm_err_out;
965 }
966 /* Child node present, descend into it. */
967 old_vcn = vcn;
968 vcn = sle64_to_cpup((u8*)ie + le16_to_cpu(ie->length) - 8);
969 if (vcn >= 0) {
970 /* If vcn is in the same page cache page as old_vcn we
971 * recycle the mapped page. */
 /*
 * NOTE(review): this test shifts by vol->cluster_size_bits
 * whereas the page index above is computed with
 * dir_ni->itype.index.vcn_size_bits - confirm the two
 * always agree.
 */
972 if (old_vcn << vol->cluster_size_bits >>
973 PAGE_CACHE_SHIFT == vcn <<
974 vol->cluster_size_bits >>
975 PAGE_CACHE_SHIFT)
976 goto fast_descend_into_child_node;
977 unlock_page(page);
978 ntfs_unmap_page(page);
979 goto descend_into_child_node;
980 }
981 ntfs_error(sb, "Negative child node vcn in directory inode "
982 "0x%lx.", dir_ni->mft_no);
983 goto unm_err_out;
984 }
985 /* No child node, return -ENOENT. */
986 ntfs_debug("Entry not found.");
987 err = -ENOENT;
988unm_err_out:
 /* Release the locked index allocation page, then fall through. */
989 unlock_page(page);
990 ntfs_unmap_page(page);
991err_out:
 /* Any path that arrives here without having set err reports -EIO. */
992 if (!err)
993 err = -EIO;
994 if (ctx)
995 ntfs_attr_put_search_ctx(ctx);
996 if (m)
997 unmap_mft_record(dir_ni);
998 return ERR_MREF(err);
999dir_err_out:
1000 ntfs_error(sb, "Corrupt directory. Aborting lookup.");
1001 goto err_out;
1002}
1003
1004#endif
1005
1006/**
1007 * ntfs_filldir - ntfs specific filldir method
1008 * @vol: current ntfs volume
1009 * @fpos: position in the directory
1010 * @ndir: ntfs inode of current directory
1011 * @ia_page: page in which the index allocation buffer holding @ie resides
1012 * @ie: current index entry
1013 * @name: buffer to use for the converted name
1014 * @dirent: vfs filldir callback context
1015 * @filldir: vfs filldir callback
1016 *
1017 * Convert the Unicode file name stored in @ie to the loaded NLS, using @name
1018 * as the output buffer, and pass the result to the @filldir callback.
1019 *
1020 * If @ia_page is not NULL it is the locked page containing the index
1021 * allocation block containing the index entry @ie.
1022 *
1023 * Note, we drop (and then reacquire) the page lock on @ia_page across the
1024 * @filldir() call otherwise we would deadlock with NFSd when it calls ->lookup
1025 * since ntfs_lookup() will lock the same page. As an optimization, we do not
1026 * retake the lock if we are returning a non-zero value as ntfs_readdir()
1027 * would need to drop the lock immediately anyway.
 *
 * Return 0 if the entry was skipped (DOS name space entry, root directory
 * self reference, system file while system files are not shown, or a name
 * that could not be converted), otherwise the value returned by @filldir.
1028 */
1029static inline int ntfs_filldir(ntfs_volume *vol, loff_t fpos,
1030 ntfs_inode *ndir, struct page *ia_page, INDEX_ENTRY *ie,
1031 u8 *name, void *dirent, filldir_t filldir)
1032{
1033 unsigned long mref;
1034 int name_len, rc;
1035 unsigned dt_type;
1036 FILE_NAME_TYPE_FLAGS name_type;
1037
1038 name_type = ie->key.file_name.file_name_type;
1039 if (name_type == FILE_NAME_DOS) {
1040 ntfs_debug("Skipping DOS name space entry.");
1041 return 0;
1042 }
1043 if (MREF_LE(ie->data.dir.indexed_file) == FILE_root) {
1044 ntfs_debug("Skipping root directory self reference entry.");
1045 return 0;
1046 }
1047 if (MREF_LE(ie->data.dir.indexed_file) < FILE_first_user &&
1048 !NVolShowSystemFiles(vol)) {
1049 ntfs_debug("Skipping system file.");
1050 return 0;
1051 }
 /* The size passed here matches the buffer allocated in ntfs_readdir(). */
1052 name_len = ntfs_ucstonls(vol, (ntfschar*)&ie->key.file_name.file_name,
1053 ie->key.file_name.file_name_length, &name,
1054 NTFS_MAX_NAME_LEN * NLS_MAX_CHARSET_SIZE + 1);
1055 if (name_len <= 0) {
1056 ntfs_debug("Skipping unrepresentable file.");
1057 return 0;
1058 }
 /* Only directories and regular files are distinguished here. */
1059 if (ie->key.file_name.file_attributes &
1060 FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT)
1061 dt_type = DT_DIR;
1062 else
1063 dt_type = DT_REG;
 /* Read everything we need from @ie before the page lock is dropped. */
1064 mref = MREF_LE(ie->data.dir.indexed_file);
1065 /*
1066 * Drop the page lock otherwise we deadlock with NFS when it calls
1067 * ->lookup since ntfs_lookup() will lock the same page.
1068 */
1069 if (ia_page)
1070 unlock_page(ia_page);
1071 ntfs_debug("Calling filldir for %s with len %i, fpos 0x%llx, inode "
1072 "0x%lx, DT_%s.", name, name_len, fpos, mref,
1073 dt_type == DT_DIR ? "DIR" : "REG");
1074 rc = filldir(dirent, name, name_len, fpos, mref, dt_type);
1075 /* Relock the page but not if we are aborting ->readdir. */
1076 if (!rc && ia_page)
1077 lock_page(ia_page);
1078 return rc;
1079}
1080
1081/*
 * ntfs_readdir - read directory entries and feed them to the @filldir callback
 *
 * This is installed as the ->readdir method in ntfs_dir_ops.
 *
1082 * We use the same basic approach as the old NTFS driver, i.e. we parse the
1083 * index root entries and then the index allocation entries that are marked
1084 * as in use in the index bitmap.
1085 *
1086 * While this will return the names in random order this doesn't matter for
1087 * ->readdir but OTOH results in a faster ->readdir.
1088 *
1089 * VFS calls ->readdir without BKL but with i_sem held. This protects the VFS
1090 * parts (e.g. ->f_pos and ->i_size, and it also protects against directory
1091 * modifications).
1092 *
1093 * Locking: - Caller must hold i_sem on the directory.
1094 * - Each page cache page in the index allocation mapping must be
1095 * locked whilst being accessed otherwise we may find a corrupt
1096 * page due to it being under ->writepage at the moment which
1097 * applies the mst protection fixups before writing out and then
1098 * removes them again after the write is complete after which it
1099 * unlocks the page.
 *
 * Directory position (->f_pos) encoding as used below:
 *   0 and 1: the emulated "." and ".." entries
 *   below vol->mft_record_size: byte offset of an index entry from the start
 *     of the (copied) index root value
 *   vol->mft_record_size and above: vol->mft_record_size plus the byte
 *     position of an index entry in the index allocation attribute
 *   vdir->i_size + vol->mft_record_size: end of directory
 *
 * Return 0 on success, including when @filldir returned a non-zero value and
 * the walk was stopped early, or a negative error code on failure. In both
 * cases ->f_pos is set to the last position reached.
1100 */
1101static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
1102{
1103 s64 ia_pos, ia_start, prev_ia_pos, bmp_pos;
1104 loff_t fpos;
1105 struct inode *bmp_vi, *vdir = filp->f_dentry->d_inode;
1106 struct super_block *sb = vdir->i_sb;
1107 ntfs_inode *ndir = NTFS_I(vdir);
1108 ntfs_volume *vol = NTFS_SB(sb);
1109 MFT_RECORD *m;
1110 INDEX_ROOT *ir = NULL;
1111 INDEX_ENTRY *ie;
1112 INDEX_ALLOCATION *ia;
1113 u8 *name = NULL;
1114 int rc, err, ir_pos, cur_bmp_pos;
1115 struct address_space *ia_mapping, *bmp_mapping;
1116 struct page *bmp_page = NULL, *ia_page = NULL;
1117 u8 *kaddr, *bmp, *index_end;
1118 ntfs_attr_search_ctx *ctx;
1119
1120 fpos = filp->f_pos;
1121 ntfs_debug("Entering for inode 0x%lx, fpos 0x%llx.",
1122 vdir->i_ino, fpos);
1123 rc = err = 0;
1124 /* Are we at end of dir yet? */
1125 if (fpos >= vdir->i_size + vol->mft_record_size)
1126 goto done;
1127 /* Emulate . and .. for all directories. */
1128 if (!fpos) {
1129 ntfs_debug("Calling filldir for . with len 1, fpos 0x0, "
1130 "inode 0x%lx, DT_DIR.", vdir->i_ino);
1131 rc = filldir(dirent, ".", 1, fpos, vdir->i_ino, DT_DIR);
1132 if (rc)
1133 goto done;
1134 fpos++;
1135 }
1136 if (fpos == 1) {
1137 ntfs_debug("Calling filldir for .. with len 2, fpos 0x1, "
1138 "inode 0x%lx, DT_DIR.",
1139 parent_ino(filp->f_dentry));
1140 rc = filldir(dirent, "..", 2, fpos,
1141 parent_ino(filp->f_dentry), DT_DIR);
1142 if (rc)
1143 goto done;
1144 fpos++;
1145 }
 /* Initialise these so that err_out only releases what was acquired. */
1146 m = NULL;
1147 ctx = NULL;
1148 /*
1149 * Allocate a buffer to store the current name being processed
1150 * converted to format determined by current NLS.
1151 */
1152 name = (u8*)kmalloc(NTFS_MAX_NAME_LEN * NLS_MAX_CHARSET_SIZE + 1,
1153 GFP_NOFS);
1154 if (unlikely(!name)) {
1155 err = -ENOMEM;
1156 goto err_out;
1157 }
1158 /* Are we jumping straight into the index allocation attribute? */
1159 if (fpos >= vol->mft_record_size)
1160 goto skip_index_root;
1161 /* Get hold of the mft record for the directory. */
1162 m = map_mft_record(ndir);
1163 if (IS_ERR(m)) {
1164 err = PTR_ERR(m);
1165 m = NULL;
1166 goto err_out;
1167 }
1168 ctx = ntfs_attr_get_search_ctx(ndir, m);
1169 if (unlikely(!ctx)) {
1170 err = -ENOMEM;
1171 goto err_out;
1172 }
1173 /* Get the offset into the index root attribute. */
1174 ir_pos = (s64)fpos;
1175 /* Find the index root attribute in the mft record. */
1176 err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL,
1177 0, ctx);
1178 if (unlikely(err)) {
1179 ntfs_error(sb, "Index root attribute missing in directory "
1180 "inode 0x%lx.", vdir->i_ino);
1181 goto err_out;
1182 }
1183 /*
1184 * Copy the index root attribute value to a buffer so that we can put
1185 * the search context and unmap the mft record before calling the
1186 * filldir() callback. We need to do this because of NFSd which calls
1187 * ->lookup() from its filldir callback() and this causes NTFS to
1188 * deadlock as ntfs_lookup() maps the mft record of the directory and
1189 * we have got it mapped here already. The only solution is for us to
1190 * unmap the mft record here so that a call to ntfs_lookup() is able to
1191 * map the mft record without deadlocking.
 *
 * Note, rc is used here only as a scratch variable holding the length
 * in bytes of the index root value.
1192 */
1193 rc = le32_to_cpu(ctx->attr->data.resident.value_length);
1194 ir = (INDEX_ROOT*)kmalloc(rc, GFP_NOFS);
1195 if (unlikely(!ir)) {
1196 err = -ENOMEM;
1197 goto err_out;
1198 }
1199 /* Copy the index root value (it has been verified in read_inode). */
1200 memcpy(ir, (u8*)ctx->attr +
1201 le16_to_cpu(ctx->attr->data.resident.value_offset), rc);
1202 ntfs_attr_put_search_ctx(ctx);
1203 unmap_mft_record(ndir);
1204 ctx = NULL;
1205 m = NULL;
1206 index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length);
1207 /* The first index entry. */
1208 ie = (INDEX_ENTRY*)((u8*)&ir->index +
1209 le32_to_cpu(ir->index.entries_offset));
1210 /*
1211 * Loop until we exceed valid memory (corruption case) or until we
1212 * reach the last entry or until filldir tells us it has had enough
1213 * or signals an error (both covered by the rc test).
1214 */
1215 for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
1216 ntfs_debug("In index root, offset 0x%zx.", (u8*)ie - (u8*)ir);
1217 /* Bounds checks. */
1218 if (unlikely((u8*)ie < (u8*)ir || (u8*)ie +
1219 sizeof(INDEX_ENTRY_HEADER) > index_end ||
1220 (u8*)ie + le16_to_cpu(ie->key_length) >
1221 index_end))
1222 goto err_out;
1223 /* The last entry cannot contain a name. */
1224 if (ie->flags & INDEX_ENTRY_END)
1225 break;
1226 /* Skip index root entry if continuing previous readdir. */
1227 if (ir_pos > (u8*)ie - (u8*)ir)
1228 continue;
1229 /* Advance the position even if going to skip the entry. */
1230 fpos = (u8*)ie - (u8*)ir;
1231 /* Submit the name to the filldir callback. */
1232 rc = ntfs_filldir(vol, fpos, ndir, NULL, ie, name, dirent,
1233 filldir);
1234 if (rc) {
1235 kfree(ir);
1236 goto abort;
1237 }
1238 }
1239 /* We are done with the index root and can free the buffer. */
1240 kfree(ir);
1241 ir = NULL;
1242 /* If there is no index allocation attribute we are finished. */
1243 if (!NInoIndexAllocPresent(ndir))
1244 goto EOD;
1245 /* Advance fpos to the beginning of the index allocation. */
1246 fpos = vol->mft_record_size;
1247skip_index_root:
1248 kaddr = NULL;
 /* -1 never matches a real page so the first index page is always mapped. */
1249 prev_ia_pos = -1LL;
1250 /* Get the offset into the index allocation attribute. */
1251 ia_pos = (s64)fpos - vol->mft_record_size;
1252 ia_mapping = vdir->i_mapping;
1253 bmp_vi = ndir->itype.index.bmp_ino;
1254 if (unlikely(!bmp_vi)) {
1255 ntfs_debug("Inode 0x%lx, regetting index bitmap.", vdir->i_ino);
1256 bmp_vi = ntfs_attr_iget(vdir, AT_BITMAP, I30, 4);
1257 if (IS_ERR(bmp_vi)) {
1258 ntfs_error(sb, "Failed to get bitmap attribute.");
1259 err = PTR_ERR(bmp_vi);
1260 goto err_out;
1261 }
1262 ndir->itype.index.bmp_ino = bmp_vi;
1263 }
1264 bmp_mapping = bmp_vi->i_mapping;
1265 /* Get the starting bitmap bit position and sanity check it. */
1266 bmp_pos = ia_pos >> ndir->itype.index.block_size_bits;
1267 if (unlikely(bmp_pos >> 3 >= bmp_vi->i_size)) {
1268 ntfs_error(sb, "Current index allocation position exceeds "
1269 "index bitmap size.");
1270 goto err_out;
1271 }
1272 /* Get the starting bit position in the current bitmap page. */
1273 cur_bmp_pos = bmp_pos & ((PAGE_CACHE_SIZE * 8) - 1);
 /* From here on bmp_pos is the bit position of the start of the page. */
1274 bmp_pos &= ~(u64)((PAGE_CACHE_SIZE * 8) - 1);
1275get_next_bmp_page:
1276 ntfs_debug("Reading bitmap with page index 0x%llx, bit ofs 0x%llx",
1277 (unsigned long long)bmp_pos >> (3 + PAGE_CACHE_SHIFT),
1278 (unsigned long long)bmp_pos &
1279 (unsigned long long)((PAGE_CACHE_SIZE * 8) - 1));
1280 bmp_page = ntfs_map_page(bmp_mapping,
1281 bmp_pos >> (3 + PAGE_CACHE_SHIFT));
1282 if (IS_ERR(bmp_page)) {
1283 ntfs_error(sb, "Reading index bitmap failed.");
1284 err = PTR_ERR(bmp_page);
1285 bmp_page = NULL;
1286 goto err_out;
1287 }
1288 bmp = (u8*)page_address(bmp_page);
1289 /* Find next index block in use. */
1290 while (!(bmp[cur_bmp_pos >> 3] & (1 << (cur_bmp_pos & 7)))) {
 /*
 * This label is also the re-entry point used after an index buffer
 * has been fully processed, see the goto following the loop over the
 * index entries below.
 */
1291find_next_index_buffer:
1292 cur_bmp_pos++;
1293 /*
1294 * If we have reached the end of the bitmap page, get the next
1295 * page, and put away the old one.
1296 */
1297 if (unlikely((cur_bmp_pos >> 3) >= PAGE_CACHE_SIZE)) {
1298 ntfs_unmap_page(bmp_page);
1299 bmp_pos += PAGE_CACHE_SIZE * 8;
1300 cur_bmp_pos = 0;
1301 goto get_next_bmp_page;
1302 }
1303 /* If we have reached the end of the bitmap, we are done. */
 /*
 * NOTE(review): this compares against vdir->i_size whereas the
 * sanity check before the loop used bmp_vi->i_size - confirm
 * which size is intended here.
 */
1304 if (unlikely(((bmp_pos + cur_bmp_pos) >> 3) >= vdir->i_size))
1305 goto unm_EOD;
1306 ia_pos = (bmp_pos + cur_bmp_pos) <<
1307 ndir->itype.index.block_size_bits;
1308 }
1309 ntfs_debug("Handling index buffer 0x%llx.",
1310 (unsigned long long)bmp_pos + cur_bmp_pos);
1311 /* If the current index buffer is in the same page we reuse the page. */
1312 if ((prev_ia_pos & PAGE_CACHE_MASK) != (ia_pos & PAGE_CACHE_MASK)) {
1313 prev_ia_pos = ia_pos;
1314 if (likely(ia_page != NULL)) {
1315 unlock_page(ia_page);
1316 ntfs_unmap_page(ia_page);
1317 }
1318 /*
1319 * Map the page cache page containing the current ia_pos,
1320 * reading it from disk if necessary.
1321 */
1322 ia_page = ntfs_map_page(ia_mapping, ia_pos >> PAGE_CACHE_SHIFT);
1323 if (IS_ERR(ia_page)) {
1324 ntfs_error(sb, "Reading index allocation data failed.");
1325 err = PTR_ERR(ia_page);
1326 ia_page = NULL;
1327 goto err_out;
1328 }
1329 lock_page(ia_page);
1330 kaddr = (u8*)page_address(ia_page);
1331 }
1332 /* Get the current index buffer. */
1333 ia = (INDEX_ALLOCATION*)(kaddr + (ia_pos & ~PAGE_CACHE_MASK &
1334 ~(s64)(ndir->itype.index.block_size - 1)));
1335 /* Bounds checks. */
1336 if (unlikely((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_CACHE_SIZE)) {
1337 ntfs_error(sb, "Out of bounds check failed. Corrupt directory "
1338 "inode 0x%lx or driver bug.", vdir->i_ino);
1339 goto err_out;
1340 }
1341 /* Catch multi sector transfer fixup errors. */
1342 if (unlikely(!ntfs_is_indx_record(ia->magic))) {
1343 ntfs_error(sb, "Directory index record with vcn 0x%llx is "
1344 "corrupt. Corrupt inode 0x%lx. Run chkdsk.",
1345 (unsigned long long)ia_pos >>
1346 ndir->itype.index.vcn_size_bits, vdir->i_ino);
1347 goto err_out;
1348 }
1349 if (unlikely(sle64_to_cpu(ia->index_block_vcn) != (ia_pos &
1350 ~(s64)(ndir->itype.index.block_size - 1)) >>
1351 ndir->itype.index.vcn_size_bits)) {
1352 ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is "
1353 "different from expected VCN (0x%llx). "
1354 "Directory inode 0x%lx is corrupt or driver "
1355 "bug. ", (unsigned long long)
1356 sle64_to_cpu(ia->index_block_vcn),
1357 (unsigned long long)ia_pos >>
1358 ndir->itype.index.vcn_size_bits, vdir->i_ino);
1359 goto err_out;
1360 }
1361 if (unlikely(le32_to_cpu(ia->index.allocated_size) + 0x18 !=
1362 ndir->itype.index.block_size)) {
1363 ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
1364 "0x%lx has a size (%u) differing from the "
1365 "directory specified size (%u). Directory "
1366 "inode is corrupt or driver bug.",
1367 (unsigned long long)ia_pos >>
1368 ndir->itype.index.vcn_size_bits, vdir->i_ino,
1369 le32_to_cpu(ia->index.allocated_size) + 0x18,
1370 ndir->itype.index.block_size);
1371 goto err_out;
1372 }
1373 index_end = (u8*)ia + ndir->itype.index.block_size;
1374 if (unlikely(index_end > kaddr + PAGE_CACHE_SIZE)) {
1375 ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
1376 "0x%lx crosses page boundary. Impossible! "
1377 "Cannot access! This is probably a bug in the "
1378 "driver.", (unsigned long long)ia_pos >>
1379 ndir->itype.index.vcn_size_bits, vdir->i_ino);
1380 goto err_out;
1381 }
 /* Byte position in the index allocation where this index buffer starts. */
1382 ia_start = ia_pos & ~(s64)(ndir->itype.index.block_size - 1);
1383 index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length);
1384 if (unlikely(index_end > (u8*)ia + ndir->itype.index.block_size)) {
1385 ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of directory "
1386 "inode 0x%lx exceeds maximum size.",
1387 (unsigned long long)ia_pos >>
1388 ndir->itype.index.vcn_size_bits, vdir->i_ino);
1389 goto err_out;
1390 }
1391 /* The first index entry in this index buffer. */
1392 ie = (INDEX_ENTRY*)((u8*)&ia->index +
1393 le32_to_cpu(ia->index.entries_offset));
1394 /*
1395 * Loop until we exceed valid memory (corruption case) or until we
1396 * reach the last entry or until filldir tells us it has had enough
1397 * or signals an error (both covered by the rc test).
1398 */
1399 for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
1400 ntfs_debug("In index allocation, offset 0x%llx.",
1401 (unsigned long long)ia_start +
1402 (unsigned long long)((u8*)ie - (u8*)ia));
1403 /* Bounds checks. */
1404 if (unlikely((u8*)ie < (u8*)ia || (u8*)ie +
1405 sizeof(INDEX_ENTRY_HEADER) > index_end ||
1406 (u8*)ie + le16_to_cpu(ie->key_length) >
1407 index_end))
1408 goto err_out;
1409 /* The last entry cannot contain a name. */
1410 if (ie->flags & INDEX_ENTRY_END)
1411 break;
1412 /* Skip index block entry if continuing previous readdir. */
1413 if (ia_pos - ia_start > (u8*)ie - (u8*)ia)
1414 continue;
1415 /* Advance the position even if going to skip the entry. */
1416 fpos = (u8*)ie - (u8*)ia +
1417 (sle64_to_cpu(ia->index_block_vcn) <<
1418 ndir->itype.index.vcn_size_bits) +
1419 vol->mft_record_size;
1420 /*
1421 * Submit the name to the @filldir callback. Note,
1422 * ntfs_filldir() drops the lock on @ia_page but it retakes it
1423 * before returning, unless a non-zero value is returned in
1424 * which case the page is left unlocked.
1425 */
1426 rc = ntfs_filldir(vol, fpos, ndir, ia_page, ie, name, dirent,
1427 filldir);
1428 if (rc) {
1429 /* @ia_page is already unlocked in this case. */
1430 ntfs_unmap_page(ia_page);
1431 ntfs_unmap_page(bmp_page);
1432 goto abort;
1433 }
1434 }
 /* This index buffer is exhausted, look for the next one in use. */
1435 goto find_next_index_buffer;
1436unm_EOD:
1437 if (ia_page) {
1438 unlock_page(ia_page);
1439 ntfs_unmap_page(ia_page);
1440 }
1441 ntfs_unmap_page(bmp_page);
1442EOD:
1443 /* We are finished, set fpos to EOD. */
1444 fpos = vdir->i_size + vol->mft_record_size;
1445abort:
1446 kfree(name);
1447done:
 /* Note, we return 0 even if filldir returned a non-zero value. */
1448#ifdef DEBUG
1449 if (!rc)
1450 ntfs_debug("EOD, fpos 0x%llx, returning 0.", fpos);
1451 else
1452 ntfs_debug("filldir returned %i, fpos 0x%llx, returning 0.",
1453 rc, fpos);
1454#endif
1455 filp->f_pos = fpos;
1456 return 0;
1457err_out:
 /* Release whatever is still held; paths that did not set err get -EIO. */
1458 if (bmp_page)
1459 ntfs_unmap_page(bmp_page);
1460 if (ia_page) {
1461 unlock_page(ia_page);
1462 ntfs_unmap_page(ia_page);
1463 }
1464 if (ir)
1465 kfree(ir);
1466 if (name)
1467 kfree(name);
1468 if (ctx)
1469 ntfs_attr_put_search_ctx(ctx);
1470 if (m)
1471 unmap_mft_record(ndir);
1472 if (!err)
1473 err = -EIO;
1474 ntfs_debug("Failed. Returning error code %i.", -err);
1475 filp->f_pos = fpos;
1476 return err;
1477}
1478
1479/**
1480 * ntfs_dir_open - called when an inode is about to be opened
1481 * @vi: inode to be opened
1482 * @filp: file structure describing the inode
1483 *
1484 * Limit directory size to the page cache limit on architectures where unsigned
1485 * long is 32-bits. This is the most we can do for now without overflowing the
1486 * page cache page index. Doing it this way means we don't run into problems
1487 * because of existing too large directories. It would be better to allow the
1488 * user to read the accessible part of the directory but I doubt very much
1489 * anyone is going to hit this check on a 32-bit architecture, so there is no
1490 * point in adding the extra complexity required to support this.
1491 *
1492 * On 64-bit architectures, the check is hopefully optimized away by the
1493 * compiler.
 *
 * Return 0 on success or -EFBIG if unsigned long is smaller than 8 bytes and
 * the size of @vi exceeds MAX_LFS_FILESIZE. @filp is not used.
1494 */
1495static int ntfs_dir_open(struct inode *vi, struct file *filp)
1496{
1497 if (sizeof(unsigned long) < 8) {
1498 if (vi->i_size > MAX_LFS_FILESIZE)
1499 return -EFBIG;
1500 }
1501 return 0;
1502}
1503
1504#ifdef NTFS_RW
1505
1506/**
1507 * ntfs_dir_fsync - sync a directory to disk
1508 * @filp: directory to be synced
1509 * @dentry: dentry describing the directory to sync
1510 * @datasync: if non-zero only flush user data and not metadata
1511 *
1512 * Data integrity sync of a directory to disk. Used for fsync, fdatasync, and
1513 * msync system calls. This function is based on file.c::ntfs_file_fsync().
1514 *
1515 * Write the mft record and all associated extent mft records as well as the
1516 * $INDEX_ALLOCATION and $BITMAP attributes and then sync the block device.
1517 *
1518 * If @datasync is true, we do not wait on the inode(s) to be written out
1519 * but we always wait on the page cache pages to be written out.
1520 *
1521 * Note: In the past @filp could be NULL so we ignore it as we don't need it
1522 * anyway.
1523 *
1524 * Locking: Caller must hold i_sem on the inode.
1525 *
1526 * TODO: We should probably also write all attribute/index inodes associated
1527 * with this inode but since we have no simple way of getting to them we ignore
1528 * this problem for now. We do write the $BITMAP attribute if it is present
1529 * which is the important one for a directory so things are not too bad.
 *
 * Return the value returned by ntfs_write_inode() or, if that is zero, the
 * value returned by sync_blockdev(). The return values of the
 * write_inode_now() calls are not checked.
1530 */
1531static int ntfs_dir_fsync(struct file *filp, struct dentry *dentry,
1532 int datasync)
1533{
1534 struct inode *vi = dentry->d_inode;
1535 ntfs_inode *ni = NTFS_I(vi);
1536 int err, ret;
1537
1538 ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
1539 BUG_ON(!S_ISDIR(vi->i_mode));
 /* Write out the index bitmap inode first if the directory has one. */
1540 if (NInoIndexAllocPresent(ni) && ni->itype.index.bmp_ino)
1541 write_inode_now(ni->itype.index.bmp_ino, !datasync);
1542 ret = ntfs_write_inode(vi, 1);
1543 write_inode_now(vi, !datasync);
1544 err = sync_blockdev(vi->i_sb->s_bdev);
 /* The first error encountered wins. */
1545 if (unlikely(err && !ret))
1546 ret = err;
1547 if (likely(!ret))
1548 ntfs_debug("Done.");
1549 else
1550 ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx. Error "
1551 "%u.", datasync ? "data" : "", vi->i_ino, -ret);
1552 return ret;
1553}
1554
1555#endif /* NTFS_RW */
1556
/*
 * File operations for ntfs directories. The ->fsync method is only provided
 * when NTFS_RW is defined. The entries that are commented out are not
 * implemented.
 */
1557struct file_operations ntfs_dir_ops = {
1558 .llseek = generic_file_llseek, /* Seek inside directory. */
1559 .read = generic_read_dir, /* Return -EISDIR. */
1560 .readdir = ntfs_readdir, /* Read directory contents. */
1561#ifdef NTFS_RW
1562 .fsync = ntfs_dir_fsync, /* Sync a directory to disk. */
1563 /*.aio_fsync = ,*/ /* Sync all outstanding async
1564 i/o operations on a kiocb. */
1565#endif /* NTFS_RW */
1566 /*.ioctl = ,*/ /* Perform function on the
1567 mounted filesystem. */
1568 .open = ntfs_dir_open, /* Open directory. */
1569};
diff --git a/fs/ntfs/dir.h b/fs/ntfs/dir.h
new file mode 100644
index 000000000000..aea7582d561f
--- /dev/null
+++ b/fs/ntfs/dir.h
@@ -0,0 +1,48 @@
1/*
2 * dir.h - Defines for directory handling in NTFS Linux kernel driver. Part of
3 * the Linux-NTFS project.
4 *
5 * Copyright (c) 2002-2004 Anton Altaparmakov
6 *
7 * This program/include file is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as published
9 * by the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program/include file is distributed in the hope that it will be
13 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program (in the main directory of the Linux-NTFS
19 * distribution in the file COPYING); if not, write to the Free Software
20 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 */
22
23#ifndef _LINUX_NTFS_DIR_H
24#define _LINUX_NTFS_DIR_H
25
26#include "layout.h"
27#include "inode.h"
28#include "types.h"
29
30/*
31 * ntfs_name is used to return the file name to the caller of
32 * ntfs_lookup_inode_by_name() in order for the caller (namei.c::ntfs_lookup())
33 * to be able to deal with dcache aliasing issues.
 *
 * The structure is packed and ends in a zero length array, i.e. the storage
 * for @name is not part of sizeof(ntfs_name) and has to follow the structure
 * in memory.
34 */
35typedef struct {
36 MFT_REF mref; /* Mft reference; NOTE(review): presumably that of the inode the name belongs to - confirm in dir.c. */
37 FILE_NAME_TYPE_FLAGS type; /* File name type flags of the returned name. */
38 u8 len; /* Length of @name; NOTE(review): presumably in ntfschar units - confirm in dir.c. */
39 ntfschar name[0]; /* The name itself (variable length trailing data). */
40} __attribute__ ((__packed__)) ntfs_name;
41
42/* The little endian Unicode string $I30 as a global constant. */
43extern ntfschar I30[5];
44
/*
 * Look up the name @uname of length @uname_len in the directory @dir_ni and
 * return an MFT_REF. According to the ntfs_name comment in this header, the
 * file name is returned to the caller via an ntfs_name, i.e. via @res.
 * NOTE(review): the definition is not in this header (presumably dir.c);
 * check there for the error encoding of the return value and for who is
 * responsible for freeing *@res.
 */
45extern MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni,
46 const ntfschar *uname, const int uname_len, ntfs_name **res);
47
48#endif /* _LINUX_NTFS_DIR_H */
diff --git a/fs/ntfs/endian.h b/fs/ntfs/endian.h
new file mode 100644
index 000000000000..927b5bf04b4f
--- /dev/null
+++ b/fs/ntfs/endian.h
@@ -0,0 +1,93 @@
1/*
2 * endian.h - Defines for endianness handling in NTFS Linux kernel driver.
3 * Part of the Linux-NTFS project.
4 *
5 * Copyright (c) 2001-2004 Anton Altaparmakov
6 *
7 * This program/include file is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as published
9 * by the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program/include file is distributed in the hope that it will be
13 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program (in the main directory of the Linux-NTFS
19 * distribution in the file COPYING); if not, write to the Free Software
20 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 */
22
23#ifndef _LINUX_NTFS_ENDIAN_H
24#define _LINUX_NTFS_ENDIAN_H
25
26#include <asm/byteorder.h>
27#include "types.h"
28
29/*
30 * Signed endianness conversion functions.
31 */
32
33static inline s16 sle16_to_cpu(sle16 x)
34{
35 return le16_to_cpu((__force le16)x);
36}
37
38static inline s32 sle32_to_cpu(sle32 x)
39{
40 return le32_to_cpu((__force le32)x);
41}
42
43static inline s64 sle64_to_cpu(sle64 x)
44{
45 return le64_to_cpu((__force le64)x);
46}
47
48static inline s16 sle16_to_cpup(sle16 *x)
49{
50 return le16_to_cpu(*(__force le16*)x);
51}
52
53static inline s32 sle32_to_cpup(sle32 *x)
54{
55 return le32_to_cpu(*(__force le32*)x);
56}
57
58static inline s64 sle64_to_cpup(sle64 *x)
59{
60 return le64_to_cpu(*(__force le64*)x);
61}
62
63static inline sle16 cpu_to_sle16(s16 x)
64{
65 return (__force sle16)cpu_to_le16(x);
66}
67
68static inline sle32 cpu_to_sle32(s32 x)
69{
70 return (__force sle32)cpu_to_le32(x);
71}
72
73static inline sle64 cpu_to_sle64(s64 x)
74{
75 return (__force sle64)cpu_to_le64(x);
76}
77
78static inline sle16 cpu_to_sle16p(s16 *x)
79{
80 return (__force sle16)cpu_to_le16(*x);
81}
82
83static inline sle32 cpu_to_sle32p(s32 *x)
84{
85 return (__force sle32)cpu_to_le32(*x);
86}
87
88static inline sle64 cpu_to_sle64p(s64 *x)
89{
90 return (__force sle64)cpu_to_le64(*x);
91}
92
93#endif /* _LINUX_NTFS_ENDIAN_H */
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
new file mode 100644
index 000000000000..db8713ea0d27
--- /dev/null
+++ b/fs/ntfs/file.c
@@ -0,0 +1,155 @@
1/*
2 * file.c - NTFS kernel file operations. Part of the Linux-NTFS project.
3 *
4 * Copyright (c) 2001-2004 Anton Altaparmakov
5 *
6 * This program/include file is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as published
8 * by the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program/include file is distributed in the hope that it will be
12 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
13 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program (in the main directory of the Linux-NTFS
18 * distribution in the file COPYING); if not, write to the Free Software
19 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21
22#include <linux/pagemap.h>
23#include <linux/buffer_head.h>
24
25#include "inode.h"
26#include "debug.h"
27#include "ntfs.h"
28
29/**
30 * ntfs_file_open - called when an inode is about to be opened
31 * @vi: inode to be opened
32 * @filp: file structure describing the inode
33 *
34 * Limit file size to the page cache limit on architectures where unsigned long
35 * is 32-bits. This is the most we can do for now without overflowing the page
36 * cache page index. Doing it this way means we don't run into problems because
37 * of existing too large files. It would be better to allow the user to read
38 * the beginning of the file but I doubt very much anyone is going to hit this
39 * check on a 32-bit architecture, so there is no point in adding the extra
40 * complexity required to support this.
41 *
42 * On 64-bit architectures, the check is hopefully optimized away by the
43 * compiler.
44 *
45 * After the check passes, just call generic_file_open() to do its work.
46 */
47static int ntfs_file_open(struct inode *vi, struct file *filp)
48{
49 if (sizeof(unsigned long) < 8) {
50 if (vi->i_size > MAX_LFS_FILESIZE)
51 return -EFBIG;
52 }
53 return generic_file_open(vi, filp);
54}
55
56#ifdef NTFS_RW
57
58/**
59 * ntfs_file_fsync - sync a file to disk
60 * @filp: file to be synced
61 * @dentry: dentry describing the file to sync
62 * @datasync: if non-zero only flush user data and not metadata
63 *
64 * Data integrity sync of a file to disk. Used for fsync, fdatasync, and msync
65 * system calls. This function is inspired by fs/buffer.c::file_fsync().
66 *
67 * If @datasync is false, write the mft record and all associated extent mft
68 * records as well as the $DATA attribute and then sync the block device.
69 *
70 * If @datasync is true and the attribute is non-resident, we skip the writing
71 * of the mft record and all associated extent mft records (this might still
72 * happen due to the write_inode_now() call).
73 *
74 * Also, if @datasync is true, we do not wait on the inode to be written out
75 * but we always wait on the page cache pages to be written out.
76 *
77 * Note: In the past @filp could be NULL so we ignore it as we don't need it
78 * anyway.
79 *
80 * Locking: Caller must hold i_sem on the inode.
81 *
82 * TODO: We should probably also write all attribute/index inodes associated
83 * with this inode but since we have no simple way of getting to them we ignore
84 * this problem for now.
85 */
86static int ntfs_file_fsync(struct file *filp, struct dentry *dentry,
87 int datasync)
88{
89 struct inode *vi = dentry->d_inode;
90 int err, ret = 0;
91
92 ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
93 BUG_ON(S_ISDIR(vi->i_mode));
94 if (!datasync || !NInoNonResident(NTFS_I(vi)))
95 ret = ntfs_write_inode(vi, 1);
96 write_inode_now(vi, !datasync);
97 err = sync_blockdev(vi->i_sb->s_bdev);
98 if (unlikely(err && !ret))
99 ret = err;
100 if (likely(!ret))
101 ntfs_debug("Done.");
102 else
103 ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx. Error "
104 "%u.", datasync ? "data" : "", vi->i_ino, -ret);
105 return ret;
106}
107
108#endif /* NTFS_RW */
109
/*
 * File operations for ntfs files. The write and fsync methods are only wired
 * up when the driver is built with NTFS_RW defined; the commented out entries
 * are methods which are not provided.
 */
110struct file_operations ntfs_file_ops = {
111 .llseek = generic_file_llseek, /* Seek inside file. */
112 .read = generic_file_read, /* Read from file. */
113 .aio_read = generic_file_aio_read, /* Async read from file. */
114 .readv = generic_file_readv, /* Vectored read from file. */
115#ifdef NTFS_RW
116 .write = generic_file_write, /* Write to file. */
117 .aio_write = generic_file_aio_write, /* Async write to file. */
118 .writev = generic_file_writev, /* Vectored write to file. */
119 /*.release = ,*/ /* Last file is closed. See
120 fs/ext2/file.c::
121 ext2_release_file() for
122 how to use this to discard
123 preallocated space for
124 write opened files. */
125 .fsync = ntfs_file_fsync, /* Sync a file to disk. */
126 /*.aio_fsync = ,*/ /* Sync all outstanding async
127 i/o operations on a
128 kiocb. */
129#endif /* NTFS_RW */
130 /*.ioctl = ,*/ /* Perform function on the
131 mounted filesystem. */
132 .mmap = generic_file_mmap, /* Mmap file. */
133 .open = ntfs_file_open, /* Open file. */
134 .sendfile = generic_file_sendfile, /* Zero-copy data send with
135 the data source being on
136 the ntfs partition. We
137 do not need to care about
138 the data destination. */
139 /*.sendpage = ,*/ /* Zero-copy data send with
140 the data destination being
141 on the ntfs partition. We
142 do not need to care about
143 the data source. */
144};
145
/*
 * Inode operations for ntfs files. Methods are only provided when the driver
 * is built with NTFS_RW defined; otherwise the structure is left empty.
 */
146struct inode_operations ntfs_file_inode_ops = {
147#ifdef NTFS_RW
148 .truncate = ntfs_truncate_vfs,
149 .setattr = ntfs_setattr,
150#endif /* NTFS_RW */
151};
152
/* File operations structure with no methods set at all. */
153struct file_operations ntfs_empty_file_ops = {};
154
/* Inode operations structure with no methods set at all. */
155struct inode_operations ntfs_empty_inode_ops = {};
diff --git a/fs/ntfs/index.c b/fs/ntfs/index.c
new file mode 100644
index 000000000000..71bd2cd7a4d9
--- /dev/null
+++ b/fs/ntfs/index.c
@@ -0,0 +1,461 @@
1/*
2 * index.c - NTFS kernel index handling. Part of the Linux-NTFS project.
3 *
4 * Copyright (c) 2004 Anton Altaparmakov
5 *
6 * This program/include file is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as published
8 * by the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program/include file is distributed in the hope that it will be
12 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
13 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program (in the main directory of the Linux-NTFS
18 * distribution in the file COPYING); if not, write to the Free Software
19 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21
22#include "aops.h"
23#include "collate.h"
24#include "debug.h"
25#include "index.h"
26#include "ntfs.h"
27
28/**
29 * ntfs_index_ctx_get - allocate and initialize a new index context
30 * @idx_ni: ntfs index inode with which to initialize the context
31 *
32 * Allocate a new index context, initialize it with @idx_ni and return it.
33 * Return NULL if allocation failed.
34 *
35 * Locking: Caller must hold i_sem on the index inode.
36 */
37ntfs_index_context *ntfs_index_ctx_get(ntfs_inode *idx_ni)
38{
39 ntfs_index_context *ictx;
40
41 ictx = kmem_cache_alloc(ntfs_index_ctx_cache, SLAB_NOFS);
42 if (ictx) {
43 ictx->idx_ni = idx_ni;
44 ictx->entry = NULL;
45 ictx->data = NULL;
46 ictx->data_len = 0;
47 ictx->is_in_root = 0;
48 ictx->ir = NULL;
49 ictx->actx = NULL;
50 ictx->base_ni = NULL;
51 ictx->ia = NULL;
52 ictx->page = NULL;
53 }
54 return ictx;
55}
56
57/**
58 * ntfs_index_ctx_put - release an index context
59 * @ictx: index context to free
60 *
61 * Release the index context @ictx, releasing all associated resources.
62 *
63 * Locking: Caller must hold i_sem on the index inode.
64 */
65void ntfs_index_ctx_put(ntfs_index_context *ictx)
66{
67 if (ictx->entry) {
68 if (ictx->is_in_root) {
69 if (ictx->actx)
70 ntfs_attr_put_search_ctx(ictx->actx);
71 if (ictx->base_ni)
72 unmap_mft_record(ictx->base_ni);
73 } else {
74 struct page *page = ictx->page;
75 if (page) {
76 BUG_ON(!PageLocked(page));
77 unlock_page(page);
78 ntfs_unmap_page(page);
79 }
80 }
81 }
82 kmem_cache_free(ntfs_index_ctx_cache, ictx);
83 return;
84}
85
86/**
87 * ntfs_index_lookup - find a key in an index and return its index entry
88 * @key: [IN] key for which to search in the index
89 * @key_len: [IN] length of @key in bytes
90 * @ictx: [IN/OUT] context describing the index and the returned entry
91 *
92 * Before calling ntfs_index_lookup(), @ictx must have been obtained from a
93 * call to ntfs_index_ctx_get().
94 *
95 * Look for the @key in the index specified by the index lookup context @ictx.
96 * ntfs_index_lookup() walks the contents of the index looking for the @key.
97 *
98 * If the @key is found in the index, 0 is returned and @ictx is setup to
99 * describe the index entry containing the matching @key. @ictx->entry is the
100 * index entry and @ictx->data and @ictx->data_len are the index entry data and
101 * its length in bytes, respectively.
102 *
103 * If the @key is not found in the index, -ENOENT is returned and @ictx is
104 * setup to describe the index entry whose key collates immediately after the
105 * search @key, i.e. this is the position in the index at which an index entry
106 * with a key of @key would need to be inserted.
107 *
108 * If an error occurs return the negative error code and @ictx is left
109 * untouched.
110 *
111 * When finished with the entry and its data, call ntfs_index_ctx_put() to free
112 * the context and other associated resources.
113 *
114 * If the index entry was modified, call flush_dcache_index_entry_page()
115 * immediately after the modification and either ntfs_index_entry_mark_dirty()
116 * or ntfs_index_entry_write() before the call to ntfs_index_ctx_put() to
117 * ensure that the changes are written to disk.
118 *
119 * Locking: - Caller must hold i_sem on the index inode.
120 * - Each page cache page in the index allocation mapping must be
121 * locked whilst being accessed otherwise we may find a corrupt
122 * page due to it being under ->writepage at the moment which
123 * applies the mst protection fixups before writing out and then
124 * removes them again after the write is complete after which it
125 * unlocks the page.
126 */
127int ntfs_index_lookup(const void *key, const int key_len,
128 ntfs_index_context *ictx)
129{
130 VCN vcn, old_vcn;
131 ntfs_inode *idx_ni = ictx->idx_ni;
132 ntfs_volume *vol = idx_ni->vol;
133 struct super_block *sb = vol->sb;
134 ntfs_inode *base_ni = idx_ni->ext.base_ntfs_ino;
135 MFT_RECORD *m;
136 INDEX_ROOT *ir;
137 INDEX_ENTRY *ie;
138 INDEX_ALLOCATION *ia;
139 u8 *index_end, *kaddr;
140 ntfs_attr_search_ctx *actx;
141 struct address_space *ia_mapping;
142 struct page *page;
143 int rc, err = 0;
144
145 ntfs_debug("Entering.");
146 BUG_ON(!NInoAttr(idx_ni));
147 BUG_ON(idx_ni->type != AT_INDEX_ALLOCATION);
148 BUG_ON(idx_ni->nr_extents != -1);
149 BUG_ON(!base_ni);
150 BUG_ON(!key);
151 BUG_ON(key_len <= 0);
152 if (!ntfs_is_collation_rule_supported(
153 idx_ni->itype.index.collation_rule)) {
154 ntfs_error(sb, "Index uses unsupported collation rule 0x%x. "
155 "Aborting lookup.", le32_to_cpu(
156 idx_ni->itype.index.collation_rule));
157 return -EOPNOTSUPP;
158 }
159 /* Get hold of the mft record for the index inode. */
160 m = map_mft_record(base_ni);
161 if (IS_ERR(m)) {
162 ntfs_error(sb, "map_mft_record() failed with error code %ld.",
163 -PTR_ERR(m));
164 return PTR_ERR(m);
165 }
166 actx = ntfs_attr_get_search_ctx(base_ni, m);
167 if (unlikely(!actx)) {
168 err = -ENOMEM;
169 goto err_out;
170 }
171 /* Find the index root attribute in the mft record. */
172 err = ntfs_attr_lookup(AT_INDEX_ROOT, idx_ni->name, idx_ni->name_len,
173 CASE_SENSITIVE, 0, NULL, 0, actx);
174 if (unlikely(err)) {
175 if (err == -ENOENT) {
176 ntfs_error(sb, "Index root attribute missing in inode "
177 "0x%lx.", idx_ni->mft_no);
178 err = -EIO;
179 }
180 goto err_out;
181 }
182 /* Get to the index root value (it has been verified in read_inode). */
183 ir = (INDEX_ROOT*)((u8*)actx->attr +
184 le16_to_cpu(actx->attr->data.resident.value_offset));
185 index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length);
186 /* The first index entry. */
187 ie = (INDEX_ENTRY*)((u8*)&ir->index +
188 le32_to_cpu(ir->index.entries_offset));
189 /*
190 * Loop until we exceed valid memory (corruption case) or until we
191 * reach the last entry.
192 */
193 for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
194 /* Bounds checks. */
195 if ((u8*)ie < (u8*)actx->mrec || (u8*)ie +
196 sizeof(INDEX_ENTRY_HEADER) > index_end ||
197 (u8*)ie + le16_to_cpu(ie->length) > index_end)
198 goto idx_err_out;
199 /*
200 * The last entry cannot contain a key. It can however contain
201 * a pointer to a child node in the B+tree so we just break out.
202 */
203 if (ie->flags & INDEX_ENTRY_END)
204 break;
205 /* Further bounds checks. */
206 if ((u32)sizeof(INDEX_ENTRY_HEADER) +
207 le16_to_cpu(ie->key_length) >
208 le16_to_cpu(ie->data.vi.data_offset) ||
209 (u32)le16_to_cpu(ie->data.vi.data_offset) +
210 le16_to_cpu(ie->data.vi.data_length) >
211 le16_to_cpu(ie->length))
212 goto idx_err_out;
213 /* If the keys match perfectly, we setup @ictx and return 0. */
214 if ((key_len == le16_to_cpu(ie->key_length)) && !memcmp(key,
215 &ie->key, key_len)) {
216ir_done:
217 ictx->is_in_root = TRUE;
218 ictx->actx = actx;
219 ictx->base_ni = base_ni;
220 ictx->ia = NULL;
221 ictx->page = NULL;
222done:
223 ictx->entry = ie;
224 ictx->data = (u8*)ie +
225 le16_to_cpu(ie->data.vi.data_offset);
226 ictx->data_len = le16_to_cpu(ie->data.vi.data_length);
227 ntfs_debug("Done.");
228 return err;
229 }
230 /*
231 * Not a perfect match, need to do full blown collation so we
232 * know which way in the B+tree we have to go.
233 */
234 rc = ntfs_collate(vol, idx_ni->itype.index.collation_rule, key,
235 key_len, &ie->key, le16_to_cpu(ie->key_length));
236 /*
237 * If @key collates before the key of the current entry, there
238 * is definitely no such key in this index but we might need to
239 * descend into the B+tree so we just break out of the loop.
240 */
241 if (rc == -1)
242 break;
243 /*
244 * A match should never happen as the memcmp() call should have
245 * cought it, but we still treat it correctly.
246 */
247 if (!rc)
248 goto ir_done;
249 /* The keys are not equal, continue the search. */
250 }
251 /*
252 * We have finished with this index without success. Check for the
253 * presence of a child node and if not present setup @ictx and return
254 * -ENOENT.
255 */
256 if (!(ie->flags & INDEX_ENTRY_NODE)) {
257 ntfs_debug("Entry not found.");
258 err = -ENOENT;
259 goto ir_done;
260 } /* Child node present, descend into it. */
261 /* Consistency check: Verify that an index allocation exists. */
262 if (!NInoIndexAllocPresent(idx_ni)) {
263 ntfs_error(sb, "No index allocation attribute but index entry "
264 "requires one. Inode 0x%lx is corrupt or "
265 "driver bug.", idx_ni->mft_no);
266 goto err_out;
267 }
268 /* Get the starting vcn of the index_block holding the child node. */
269 vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8));
270 ia_mapping = VFS_I(idx_ni)->i_mapping;
271 /*
272 * We are done with the index root and the mft record. Release them,
273 * otherwise we deadlock with ntfs_map_page().
274 */
275 ntfs_attr_put_search_ctx(actx);
276 unmap_mft_record(base_ni);
277 m = NULL;
278 actx = NULL;
279descend_into_child_node:
280 /*
281 * Convert vcn to index into the index allocation attribute in units
282 * of PAGE_CACHE_SIZE and map the page cache page, reading it from
283 * disk if necessary.
284 */
285 page = ntfs_map_page(ia_mapping, vcn <<
286 idx_ni->itype.index.vcn_size_bits >> PAGE_CACHE_SHIFT);
287 if (IS_ERR(page)) {
288 ntfs_error(sb, "Failed to map index page, error %ld.",
289 -PTR_ERR(page));
290 err = PTR_ERR(page);
291 goto err_out;
292 }
293 lock_page(page);
294 kaddr = (u8*)page_address(page);
295fast_descend_into_child_node:
296 /* Get to the index allocation block. */
297 ia = (INDEX_ALLOCATION*)(kaddr + ((vcn <<
298 idx_ni->itype.index.vcn_size_bits) & ~PAGE_CACHE_MASK));
299 /* Bounds checks. */
300 if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_CACHE_SIZE) {
301 ntfs_error(sb, "Out of bounds check failed. Corrupt inode "
302 "0x%lx or driver bug.", idx_ni->mft_no);
303 goto unm_err_out;
304 }
305 /* Catch multi sector transfer fixup errors. */
306 if (unlikely(!ntfs_is_indx_record(ia->magic))) {
307 ntfs_error(sb, "Index record with vcn 0x%llx is corrupt. "
308 "Corrupt inode 0x%lx. Run chkdsk.",
309 (long long)vcn, idx_ni->mft_no);
310 goto unm_err_out;
311 }
312 if (sle64_to_cpu(ia->index_block_vcn) != vcn) {
313 ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is "
314 "different from expected VCN (0x%llx). Inode "
315 "0x%lx is corrupt or driver bug.",
316 (unsigned long long)
317 sle64_to_cpu(ia->index_block_vcn),
318 (unsigned long long)vcn, idx_ni->mft_no);
319 goto unm_err_out;
320 }
321 if (le32_to_cpu(ia->index.allocated_size) + 0x18 !=
322 idx_ni->itype.index.block_size) {
323 ntfs_error(sb, "Index buffer (VCN 0x%llx) of inode 0x%lx has "
324 "a size (%u) differing from the index "
325 "specified size (%u). Inode is corrupt or "
326 "driver bug.", (unsigned long long)vcn,
327 idx_ni->mft_no,
328 le32_to_cpu(ia->index.allocated_size) + 0x18,
329 idx_ni->itype.index.block_size);
330 goto unm_err_out;
331 }
332 index_end = (u8*)ia + idx_ni->itype.index.block_size;
333 if (index_end > kaddr + PAGE_CACHE_SIZE) {
334 ntfs_error(sb, "Index buffer (VCN 0x%llx) of inode 0x%lx "
335 "crosses page boundary. Impossible! Cannot "
336 "access! This is probably a bug in the "
337 "driver.", (unsigned long long)vcn,
338 idx_ni->mft_no);
339 goto unm_err_out;
340 }
341 index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length);
342 if (index_end > (u8*)ia + idx_ni->itype.index.block_size) {
343 ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of inode "
344 "0x%lx exceeds maximum size.",
345 (unsigned long long)vcn, idx_ni->mft_no);
346 goto unm_err_out;
347 }
348 /* The first index entry. */
349 ie = (INDEX_ENTRY*)((u8*)&ia->index +
350 le32_to_cpu(ia->index.entries_offset));
351 /*
352 * Iterate similar to above big loop but applied to index buffer, thus
353 * loop until we exceed valid memory (corruption case) or until we
354 * reach the last entry.
355 */
356 for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
357 /* Bounds checks. */
358 if ((u8*)ie < (u8*)ia || (u8*)ie +
359 sizeof(INDEX_ENTRY_HEADER) > index_end ||
360 (u8*)ie + le16_to_cpu(ie->length) > index_end) {
361 ntfs_error(sb, "Index entry out of bounds in inode "
362 "0x%lx.", idx_ni->mft_no);
363 goto unm_err_out;
364 }
365 /*
366 * The last entry cannot contain a key. It can however contain
367 * a pointer to a child node in the B+tree so we just break out.
368 */
369 if (ie->flags & INDEX_ENTRY_END)
370 break;
371 /* Further bounds checks. */
372 if ((u32)sizeof(INDEX_ENTRY_HEADER) +
373 le16_to_cpu(ie->key_length) >
374 le16_to_cpu(ie->data.vi.data_offset) ||
375 (u32)le16_to_cpu(ie->data.vi.data_offset) +
376 le16_to_cpu(ie->data.vi.data_length) >
377 le16_to_cpu(ie->length)) {
378 ntfs_error(sb, "Index entry out of bounds in inode "
379 "0x%lx.", idx_ni->mft_no);
380 goto unm_err_out;
381 }
382 /* If the keys match perfectly, we setup @ictx and return 0. */
383 if ((key_len == le16_to_cpu(ie->key_length)) && !memcmp(key,
384 &ie->key, key_len)) {
385ia_done:
386 ictx->is_in_root = FALSE;
387 ictx->actx = NULL;
388 ictx->base_ni = NULL;
389 ictx->ia = ia;
390 ictx->page = page;
391 goto done;
392 }
393 /*
394 * Not a perfect match, need to do full blown collation so we
395 * know which way in the B+tree we have to go.
396 */
397 rc = ntfs_collate(vol, idx_ni->itype.index.collation_rule, key,
398 key_len, &ie->key, le16_to_cpu(ie->key_length));
399 /*
400 * If @key collates before the key of the current entry, there
401 * is definitely no such key in this index but we might need to
402 * descend into the B+tree so we just break out of the loop.
403 */
404 if (rc == -1)
405 break;
406 /*
407 * A match should never happen as the memcmp() call should have
408 * cought it, but we still treat it correctly.
409 */
410 if (!rc)
411 goto ia_done;
412 /* The keys are not equal, continue the search. */
413 }
414 /*
415 * We have finished with this index buffer without success. Check for
416 * the presence of a child node and if not present return -ENOENT.
417 */
418 if (!(ie->flags & INDEX_ENTRY_NODE)) {
419 ntfs_debug("Entry not found.");
420 err = -ENOENT;
421 goto ia_done;
422 }
423 if ((ia->index.flags & NODE_MASK) == LEAF_NODE) {
424 ntfs_error(sb, "Index entry with child node found in a leaf "
425 "node in inode 0x%lx.", idx_ni->mft_no);
426 goto unm_err_out;
427 }
428 /* Child node present, descend into it. */
429 old_vcn = vcn;
430 vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8));
431 if (vcn >= 0) {
432 /*
433 * If vcn is in the same page cache page as old_vcn we recycle
434 * the mapped page.
435 */
436 if (old_vcn << vol->cluster_size_bits >>
437 PAGE_CACHE_SHIFT == vcn <<
438 vol->cluster_size_bits >>
439 PAGE_CACHE_SHIFT)
440 goto fast_descend_into_child_node;
441 unlock_page(page);
442 ntfs_unmap_page(page);
443 goto descend_into_child_node;
444 }
445 ntfs_error(sb, "Negative child node vcn in inode 0x%lx.",
446 idx_ni->mft_no);
447unm_err_out:
448 unlock_page(page);
449 ntfs_unmap_page(page);
450err_out:
451 if (!err)
452 err = -EIO;
453 if (actx)
454 ntfs_attr_put_search_ctx(actx);
455 if (m)
456 unmap_mft_record(base_ni);
457 return err;
458idx_err_out:
459 ntfs_error(sb, "Corrupt index. Aborting lookup.");
460 goto err_out;
461}
diff --git a/fs/ntfs/index.h b/fs/ntfs/index.h
new file mode 100644
index 000000000000..846a489e8692
--- /dev/null
+++ b/fs/ntfs/index.h
@@ -0,0 +1,148 @@
1/*
2 * index.h - Defines for NTFS kernel index handling. Part of the Linux-NTFS
3 * project.
4 *
5 * Copyright (c) 2004 Anton Altaparmakov
6 *
7 * This program/include file is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as published
9 * by the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program/include file is distributed in the hope that it will be
13 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program (in the main directory of the Linux-NTFS
19 * distribution in the file COPYING); if not, write to the Free Software
20 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 */
22
23#ifndef _LINUX_NTFS_INDEX_H
24#define _LINUX_NTFS_INDEX_H
25
26#include <linux/fs.h>
27
28#include "types.h"
29#include "layout.h"
30#include "inode.h"
31#include "attrib.h"
32#include "mft.h"
33#include "aops.h"
34
35/**
 * ntfs_index_context - context describing an index entry found by a lookup
36 * @idx_ni: index inode containing the @entry described by this context
37 * @entry: index entry (points into @ir or @ia)
38 * @data: index entry data (points into @entry)
39 * @data_len: length in bytes of @data
40 * @is_in_root: TRUE if @entry is in @ir and FALSE if it is in @ia
41 * @ir: index root if @is_in_root and NULL otherwise
42 * @actx: attribute search context if @is_in_root and NULL otherwise
43 * @base_ni: base inode if @is_in_root and NULL otherwise
44 * @ia: index block if @is_in_root is FALSE and NULL otherwise
45 * @page: page if @is_in_root is FALSE and NULL otherwise
46 *
47 * @idx_ni is the index inode this context belongs to.
48 *
49 * @entry is the index entry described by this context. @data and @data_len
50 * are the index entry data and its length in bytes, respectively. @data
51 * simply points into @entry. This is probably what the user is interested in.
52 *
53 * If @is_in_root is TRUE, @entry is in the index root attribute @ir described
54 * by the attribute search context @actx and the base inode @base_ni. @ia and
55 * @page are NULL in this case.
56 *
57 * If @is_in_root is FALSE, @entry is in the index allocation attribute and @ia
58 * and @page point to the index allocation block and the mapped, locked page it
59 * is in, respectively. @ir, @actx and @base_ni are NULL in this case.
60 *
61 * To obtain a context call ntfs_index_ctx_get().
62 *
63 * We use this context to allow ntfs_index_lookup() to return the found index
64 * @entry and its @data without having to allocate a buffer and copy the @entry
65 * and/or its @data into it.
66 *
67 * When finished with the @entry and its @data, call ntfs_index_ctx_put() to
68 * free the context and other associated resources.
69 *
70 * If the index entry was modified, call ntfs_index_entry_flush_dcache_page()
71 * immediately after the modification and either ntfs_index_entry_mark_dirty()
72 * or ntfs_index_entry_write() before the call to ntfs_index_ctx_put() to
73 * ensure that the changes are written to disk.
74 */
75typedef struct {
76 ntfs_inode *idx_ni;
77 INDEX_ENTRY *entry;
78 void *data;
79 u16 data_len;
80 BOOL is_in_root;
81 INDEX_ROOT *ir;
82 ntfs_attr_search_ctx *actx;
83 ntfs_inode *base_ni;
84 INDEX_ALLOCATION *ia;
85 struct page *page;
86} ntfs_index_context;
87
/* Allocate and release index contexts (implemented in index.c). */
88extern ntfs_index_context *ntfs_index_ctx_get(ntfs_inode *idx_ni);
89extern void ntfs_index_ctx_put(ntfs_index_context *ictx);
90
/*
 * Search the index described by @ictx for @key (implemented in index.c).
 * Returns 0 if found, -ENOENT if not found, and another negative error code
 * on failure.
 */
91extern int ntfs_index_lookup(const void *key, const int key_len,
92 ntfs_index_context *ictx);
93
94#ifdef NTFS_RW
95
96/**
97 * ntfs_index_entry_flush_dcache_page - flush_dcache_page() for index entries
98 * @ictx: ntfs index context describing the index entry
99 *
100 * Call flush_dcache_page() for the page in which an index entry resides.
101 *
102 * This must be called every time an index entry is modified, just after the
103 * modification.
104 *
105 * If the index entry is in the index root attribute, simply flush the page
106 * containing the mft record containing the index root attribute.
107 *
108 * If the index entry is in an index block belonging to the index allocation
109 * attribute, simply flush the page cache page containing the index block.
110 */
111static inline void ntfs_index_entry_flush_dcache_page(ntfs_index_context *ictx)
112{
113 if (ictx->is_in_root)
114 flush_dcache_mft_record_page(ictx->actx->ntfs_ino);
115 else
116 flush_dcache_page(ictx->page);
117}
118
119/**
120 * ntfs_index_entry_mark_dirty - mark an index entry dirty
121 * @ictx: ntfs index context describing the index entry
122 *
123 * Mark the index entry described by the index entry context @ictx dirty.
124 *
125 * If the index entry is in the index root attribute, simply mark the mft
126 * record containing the index root attribute dirty. This ensures the mft
127 * record, and hence the index root attribute, will be written out to disk
128 * later.
129 *
130 * If the index entry is in an index block belonging to the index allocation
131 * attribute, mark the buffers belonging to the index record as well as the
132 * page cache page the index block is in dirty. This automatically marks the
133 * VFS inode of the ntfs index inode to which the index entry belongs dirty,
134 * too (I_DIRTY_PAGES) and this in turn ensures the page buffers, and hence the
135 * dirty index block, will be written out to disk later.
136 */
137static inline void ntfs_index_entry_mark_dirty(ntfs_index_context *ictx)
138{
139 if (ictx->is_in_root)
140 mark_mft_record_dirty(ictx->actx->ntfs_ino);
141 else
142 mark_ntfs_record_dirty(ictx->page,
143 (u8*)ictx->ia - (u8*)page_address(ictx->page));
144}
145
146#endif /* NTFS_RW */
147
148#endif /* _LINUX_NTFS_INDEX_H */
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
new file mode 100644
index 000000000000..31840ba0b38c
--- /dev/null
+++ b/fs/ntfs/inode.c
@@ -0,0 +1,2616 @@
1/**
2 * inode.c - NTFS kernel inode handling. Part of the Linux-NTFS project.
3 *
4 * Copyright (c) 2001-2004 Anton Altaparmakov
5 *
6 * This program/include file is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as published
8 * by the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program/include file is distributed in the hope that it will be
12 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
13 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program (in the main directory of the Linux-NTFS
18 * distribution in the file COPYING); if not, write to the Free Software
19 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21
22#include <linux/pagemap.h>
23#include <linux/buffer_head.h>
24#include <linux/smp_lock.h>
25#include <linux/quotaops.h>
26#include <linux/mount.h>
27
28#include "aops.h"
29#include "dir.h"
30#include "debug.h"
31#include "inode.h"
32#include "attrib.h"
33#include "malloc.h"
34#include "mft.h"
35#include "time.h"
36#include "ntfs.h"
37
38/**
39 * ntfs_test_inode - compare two (possibly fake) inodes for equality
40 * @vi: vfs inode which to test
41 * @na: ntfs attribute which is being tested with
42 *
43 * Compare the ntfs attribute embedded in the ntfs specific part of the vfs
44 * inode @vi for equality with the ntfs attribute @na.
45 *
46 * If searching for the normal file/directory inode, set @na->type to AT_UNUSED.
47 * @na->name and @na->name_len are then ignored.
48 *
49 * Return 1 if the attributes match and 0 if not.
50 *
51 * NOTE: This function runs with the inode_lock spin lock held so it is not
52 * allowed to sleep.
53 */
54int ntfs_test_inode(struct inode *vi, ntfs_attr *na)
55{
56 ntfs_inode *ni;
57
58 if (vi->i_ino != na->mft_no)
59 return 0;
60 ni = NTFS_I(vi);
61 /* If !NInoAttr(ni), @vi is a normal file or directory inode. */
62 if (likely(!NInoAttr(ni))) {
63 /* If not looking for a normal inode this is a mismatch. */
64 if (unlikely(na->type != AT_UNUSED))
65 return 0;
66 } else {
67 /* A fake inode describing an attribute. */
68 if (ni->type != na->type)
69 return 0;
70 if (ni->name_len != na->name_len)
71 return 0;
72 if (na->name_len && memcmp(ni->name, na->name,
73 na->name_len * sizeof(ntfschar)))
74 return 0;
75 }
76 /* Match! */
77 return 1;
78}
79
80/**
81 * ntfs_init_locked_inode - initialize an inode
82 * @vi: vfs inode to initialize
83 * @na: ntfs attribute which to initialize @vi to
84 *
85 * Initialize the vfs inode @vi with the values from the ntfs attribute @na in
86 * order to enable ntfs_test_inode() to do its work.
87 *
88 * If initializing the normal file/directory inode, set @na->type to AT_UNUSED.
89 * In that case, @na->name and @na->name_len should be set to NULL and 0,
90 * respectively. Although that is not strictly necessary as
91 * ntfs_read_inode_locked() will fill them in later.
92 *
93 * Return 0 on success and -errno on error.
94 *
95 * NOTE: This function runs with the inode_lock spin lock held so it is not
96 * allowed to sleep. (Hence the GFP_ATOMIC allocation.)
97 */
98static int ntfs_init_locked_inode(struct inode *vi, ntfs_attr *na)
99{
100 ntfs_inode *ni = NTFS_I(vi);
101
102 vi->i_ino = na->mft_no;
103
104 ni->type = na->type;
105 if (na->type == AT_INDEX_ALLOCATION)
106 NInoSetMstProtected(ni);
107
108 ni->name = na->name;
109 ni->name_len = na->name_len;
110
111 /* If initializing a normal inode, we are done. */
112 if (likely(na->type == AT_UNUSED)) {
113 BUG_ON(na->name);
114 BUG_ON(na->name_len);
115 return 0;
116 }
117
118 /* It is a fake inode. */
119 NInoSetAttr(ni);
120
121 /*
122 * We have I30 global constant as an optimization as it is the name
123 * in >99.9% of named attributes! The other <0.1% incur a GFP_ATOMIC
124 * allocation but that is ok. And most attributes are unnamed anyway,
125 * thus the fraction of named attributes with name != I30 is actually
126 * absolutely tiny.
127 */
128 if (na->name_len && na->name != I30) {
129 unsigned int i;
130
131 BUG_ON(!na->name);
132 i = na->name_len * sizeof(ntfschar);
133 ni->name = (ntfschar*)kmalloc(i + sizeof(ntfschar), GFP_ATOMIC);
134 if (!ni->name)
135 return -ENOMEM;
136 memcpy(ni->name, na->name, i);
137 ni->name[i] = 0;
138 }
139 return 0;
140}
141
/* Type used to cast the inode initialization callback given to iget5_locked(). */
typedef int (*set_t)(struct inode *, void *);

/* Readers for freshly allocated (I_NEW) inodes; defined further below. */
static int ntfs_read_locked_inode(struct inode *vi);
static int ntfs_read_locked_attr_inode(struct inode *base_vi,
		struct inode *vi);
static int ntfs_read_locked_index_inode(struct inode *base_vi,
		struct inode *vi);
147
148/**
149 * ntfs_iget - obtain a struct inode corresponding to a specific normal inode
150 * @sb: super block of mounted volume
151 * @mft_no: mft record number / inode number to obtain
152 *
153 * Obtain the struct inode corresponding to a specific normal inode (i.e. a
154 * file or directory).
155 *
156 * If the inode is in the cache, it is just returned with an increased
157 * reference count. Otherwise, a new struct inode is allocated and initialized,
158 * and finally ntfs_read_locked_inode() is called to read in the inode and
159 * fill in the remainder of the inode structure.
160 *
161 * Return the struct inode on success. Check the return value with IS_ERR() and
162 * if true, the function failed and the error code is obtained from PTR_ERR().
163 */
164struct inode *ntfs_iget(struct super_block *sb, unsigned long mft_no)
165{
166 struct inode *vi;
167 ntfs_attr na;
168 int err;
169
170 na.mft_no = mft_no;
171 na.type = AT_UNUSED;
172 na.name = NULL;
173 na.name_len = 0;
174
175 vi = iget5_locked(sb, mft_no, (test_t)ntfs_test_inode,
176 (set_t)ntfs_init_locked_inode, &na);
177 if (!vi)
178 return ERR_PTR(-ENOMEM);
179
180 err = 0;
181
182 /* If this is a freshly allocated inode, need to read it now. */
183 if (vi->i_state & I_NEW) {
184 err = ntfs_read_locked_inode(vi);
185 unlock_new_inode(vi);
186 }
187 /*
188 * There is no point in keeping bad inodes around if the failure was
189 * due to ENOMEM. We want to be able to retry again later.
190 */
191 if (err == -ENOMEM) {
192 iput(vi);
193 vi = ERR_PTR(err);
194 }
195 return vi;
196}
197
198/**
199 * ntfs_attr_iget - obtain a struct inode corresponding to an attribute
200 * @base_vi: vfs base inode containing the attribute
201 * @type: attribute type
202 * @name: Unicode name of the attribute (NULL if unnamed)
203 * @name_len: length of @name in Unicode characters (0 if unnamed)
204 *
205 * Obtain the (fake) struct inode corresponding to the attribute specified by
206 * @type, @name, and @name_len, which is present in the base mft record
207 * specified by the vfs inode @base_vi.
208 *
209 * If the attribute inode is in the cache, it is just returned with an
210 * increased reference count. Otherwise, a new struct inode is allocated and
211 * initialized, and finally ntfs_read_locked_attr_inode() is called to read the
212 * attribute and fill in the inode structure.
213 *
214 * Note, for index allocation attributes, you need to use ntfs_index_iget()
215 * instead of ntfs_attr_iget() as working with indices is a lot more complex.
216 *
217 * Return the struct inode of the attribute inode on success. Check the return
218 * value with IS_ERR() and if true, the function failed and the error code is
219 * obtained from PTR_ERR().
220 */
221struct inode *ntfs_attr_iget(struct inode *base_vi, ATTR_TYPE type,
222 ntfschar *name, u32 name_len)
223{
224 struct inode *vi;
225 ntfs_attr na;
226 int err;
227
228 /* Make sure no one calls ntfs_attr_iget() for indices. */
229 BUG_ON(type == AT_INDEX_ALLOCATION);
230
231 na.mft_no = base_vi->i_ino;
232 na.type = type;
233 na.name = name;
234 na.name_len = name_len;
235
236 vi = iget5_locked(base_vi->i_sb, na.mft_no, (test_t)ntfs_test_inode,
237 (set_t)ntfs_init_locked_inode, &na);
238 if (!vi)
239 return ERR_PTR(-ENOMEM);
240
241 err = 0;
242
243 /* If this is a freshly allocated inode, need to read it now. */
244 if (vi->i_state & I_NEW) {
245 err = ntfs_read_locked_attr_inode(base_vi, vi);
246 unlock_new_inode(vi);
247 }
248 /*
249 * There is no point in keeping bad attribute inodes around. This also
250 * simplifies things in that we never need to check for bad attribute
251 * inodes elsewhere.
252 */
253 if (err) {
254 iput(vi);
255 vi = ERR_PTR(err);
256 }
257 return vi;
258}
259
260/**
261 * ntfs_index_iget - obtain a struct inode corresponding to an index
262 * @base_vi: vfs base inode containing the index related attributes
263 * @name: Unicode name of the index
264 * @name_len: length of @name in Unicode characters
265 *
266 * Obtain the (fake) struct inode corresponding to the index specified by @name
267 * and @name_len, which is present in the base mft record specified by the vfs
268 * inode @base_vi.
269 *
270 * If the index inode is in the cache, it is just returned with an increased
271 * reference count. Otherwise, a new struct inode is allocated and
272 * initialized, and finally ntfs_read_locked_index_inode() is called to read
273 * the index related attributes and fill in the inode structure.
274 *
275 * Return the struct inode of the index inode on success. Check the return
276 * value with IS_ERR() and if true, the function failed and the error code is
277 * obtained from PTR_ERR().
278 */
279struct inode *ntfs_index_iget(struct inode *base_vi, ntfschar *name,
280 u32 name_len)
281{
282 struct inode *vi;
283 ntfs_attr na;
284 int err;
285
286 na.mft_no = base_vi->i_ino;
287 na.type = AT_INDEX_ALLOCATION;
288 na.name = name;
289 na.name_len = name_len;
290
291 vi = iget5_locked(base_vi->i_sb, na.mft_no, (test_t)ntfs_test_inode,
292 (set_t)ntfs_init_locked_inode, &na);
293 if (!vi)
294 return ERR_PTR(-ENOMEM);
295
296 err = 0;
297
298 /* If this is a freshly allocated inode, need to read it now. */
299 if (vi->i_state & I_NEW) {
300 err = ntfs_read_locked_index_inode(base_vi, vi);
301 unlock_new_inode(vi);
302 }
303 /*
304 * There is no point in keeping bad index inodes around. This also
305 * simplifies things in that we never need to check for bad index
306 * inodes elsewhere.
307 */
308 if (err) {
309 iput(vi);
310 vi = ERR_PTR(err);
311 }
312 return vi;
313}
314
315struct inode *ntfs_alloc_big_inode(struct super_block *sb)
316{
317 ntfs_inode *ni;
318
319 ntfs_debug("Entering.");
320 ni = (ntfs_inode *)kmem_cache_alloc(ntfs_big_inode_cache,
321 SLAB_NOFS);
322 if (likely(ni != NULL)) {
323 ni->state = 0;
324 return VFS_I(ni);
325 }
326 ntfs_error(sb, "Allocation of NTFS big inode structure failed.");
327 return NULL;
328}
329
330void ntfs_destroy_big_inode(struct inode *inode)
331{
332 ntfs_inode *ni = NTFS_I(inode);
333
334 ntfs_debug("Entering.");
335 BUG_ON(ni->page);
336 if (!atomic_dec_and_test(&ni->count))
337 BUG();
338 kmem_cache_free(ntfs_big_inode_cache, NTFS_I(inode));
339}
340
341static inline ntfs_inode *ntfs_alloc_extent_inode(void)
342{
343 ntfs_inode *ni;
344
345 ntfs_debug("Entering.");
346 ni = (ntfs_inode *)kmem_cache_alloc(ntfs_inode_cache, SLAB_NOFS);
347 if (likely(ni != NULL)) {
348 ni->state = 0;
349 return ni;
350 }
351 ntfs_error(NULL, "Allocation of NTFS inode structure failed.");
352 return NULL;
353}
354
355static void ntfs_destroy_extent_inode(ntfs_inode *ni)
356{
357 ntfs_debug("Entering.");
358 BUG_ON(ni->page);
359 if (!atomic_dec_and_test(&ni->count))
360 BUG();
361 kmem_cache_free(ntfs_inode_cache, ni);
362}
363
364/**
365 * __ntfs_init_inode - initialize ntfs specific part of an inode
366 * @sb: super block of mounted volume
367 * @ni: freshly allocated ntfs inode which to initialize
368 *
369 * Initialize an ntfs inode to defaults.
370 *
371 * NOTE: ni->mft_no, ni->state, ni->type, ni->name, and ni->name_len are left
372 * untouched. Make sure to initialize them elsewhere.
373 *
374 * Return zero on success and -ENOMEM on error.
375 */
376void __ntfs_init_inode(struct super_block *sb, ntfs_inode *ni)
377{
378 ntfs_debug("Entering.");
379 ni->initialized_size = ni->allocated_size = 0;
380 ni->seq_no = 0;
381 atomic_set(&ni->count, 1);
382 ni->vol = NTFS_SB(sb);
383 ntfs_init_runlist(&ni->runlist);
384 init_MUTEX(&ni->mrec_lock);
385 ni->page = NULL;
386 ni->page_ofs = 0;
387 ni->attr_list_size = 0;
388 ni->attr_list = NULL;
389 ntfs_init_runlist(&ni->attr_list_rl);
390 ni->itype.index.bmp_ino = NULL;
391 ni->itype.index.block_size = 0;
392 ni->itype.index.vcn_size = 0;
393 ni->itype.index.collation_rule = 0;
394 ni->itype.index.block_size_bits = 0;
395 ni->itype.index.vcn_size_bits = 0;
396 init_MUTEX(&ni->extent_lock);
397 ni->nr_extents = 0;
398 ni->ext.base_ntfs_ino = NULL;
399}
400
401inline ntfs_inode *ntfs_new_extent_inode(struct super_block *sb,
402 unsigned long mft_no)
403{
404 ntfs_inode *ni = ntfs_alloc_extent_inode();
405
406 ntfs_debug("Entering.");
407 if (likely(ni != NULL)) {
408 __ntfs_init_inode(sb, ni);
409 ni->mft_no = mft_no;
410 ni->type = AT_UNUSED;
411 ni->name = NULL;
412 ni->name_len = 0;
413 }
414 return ni;
415}
416
417/**
418 * ntfs_is_extended_system_file - check if a file is in the $Extend directory
419 * @ctx: initialized attribute search context
420 *
421 * Search all file name attributes in the inode described by the attribute
422 * search context @ctx and check if any of the names are in the $Extend system
423 * directory.
424 *
425 * Return values:
426 * 1: file is in $Extend directory
427 * 0: file is not in $Extend directory
428 * -errno: failed to determine if the file is in the $Extend directory
429 */
430static int ntfs_is_extended_system_file(ntfs_attr_search_ctx *ctx)
431{
432 int nr_links, err;
433
434 /* Restart search. */
435 ntfs_attr_reinit_search_ctx(ctx);
436
437 /* Get number of hard links. */
438 nr_links = le16_to_cpu(ctx->mrec->link_count);
439
440 /* Loop through all hard links. */
441 while (!(err = ntfs_attr_lookup(AT_FILE_NAME, NULL, 0, 0, 0, NULL, 0,
442 ctx))) {
443 FILE_NAME_ATTR *file_name_attr;
444 ATTR_RECORD *attr = ctx->attr;
445 u8 *p, *p2;
446
447 nr_links--;
448 /*
449 * Maximum sanity checking as we are called on an inode that
450 * we suspect might be corrupt.
451 */
452 p = (u8*)attr + le32_to_cpu(attr->length);
453 if (p < (u8*)ctx->mrec || (u8*)p > (u8*)ctx->mrec +
454 le32_to_cpu(ctx->mrec->bytes_in_use)) {
455err_corrupt_attr:
456 ntfs_error(ctx->ntfs_ino->vol->sb, "Corrupt file name "
457 "attribute. You should run chkdsk.");
458 return -EIO;
459 }
460 if (attr->non_resident) {
461 ntfs_error(ctx->ntfs_ino->vol->sb, "Non-resident file "
462 "name. You should run chkdsk.");
463 return -EIO;
464 }
465 if (attr->flags) {
466 ntfs_error(ctx->ntfs_ino->vol->sb, "File name with "
467 "invalid flags. You should run "
468 "chkdsk.");
469 return -EIO;
470 }
471 if (!(attr->data.resident.flags & RESIDENT_ATTR_IS_INDEXED)) {
472 ntfs_error(ctx->ntfs_ino->vol->sb, "Unindexed file "
473 "name. You should run chkdsk.");
474 return -EIO;
475 }
476 file_name_attr = (FILE_NAME_ATTR*)((u8*)attr +
477 le16_to_cpu(attr->data.resident.value_offset));
478 p2 = (u8*)attr + le32_to_cpu(attr->data.resident.value_length);
479 if (p2 < (u8*)attr || p2 > p)
480 goto err_corrupt_attr;
481 /* This attribute is ok, but is it in the $Extend directory? */
482 if (MREF_LE(file_name_attr->parent_directory) == FILE_Extend)
483 return 1; /* YES, it's an extended system file. */
484 }
485 if (unlikely(err != -ENOENT))
486 return err;
487 if (unlikely(nr_links)) {
488 ntfs_error(ctx->ntfs_ino->vol->sb, "Inode hard link count "
489 "doesn't match number of name attributes. You "
490 "should run chkdsk.");
491 return -EIO;
492 }
493 return 0; /* NO, it is not an extended system file. */
494}
495
496/**
497 * ntfs_read_locked_inode - read an inode from its device
498 * @vi: inode to read
499 *
500 * ntfs_read_locked_inode() is called from ntfs_iget() to read the inode
501 * described by @vi into memory from the device.
502 *
503 * The only fields in @vi that we need to/can look at when the function is
504 * called are i_sb, pointing to the mounted device's super block, and i_ino,
505 * the number of the inode to load.
506 *
507 * ntfs_read_locked_inode() maps, pins and locks the mft record number i_ino
508 * for reading and sets up the necessary @vi fields as well as initializing
509 * the ntfs inode.
510 *
511 * Q: What locks are held when the function is called?
512 * A: i_state has I_LOCK set, hence the inode is locked, also
513 * i_count is set to 1, so it is not going to go away
514 * i_flags is set to 0 and we have no business touching it. Only an ioctl()
515 * is allowed to write to them. We should of course be honouring them but
516 * we need to do that using the IS_* macros defined in include/linux/fs.h.
517 * In any case ntfs_read_locked_inode() has nothing to do with i_flags.
518 *
519 * Return 0 on success and -errno on error. In the error case, the inode will
520 * have had make_bad_inode() executed on it.
521 */
522static int ntfs_read_locked_inode(struct inode *vi)
523{
524 ntfs_volume *vol = NTFS_SB(vi->i_sb);
525 ntfs_inode *ni;
526 MFT_RECORD *m;
527 STANDARD_INFORMATION *si;
528 ntfs_attr_search_ctx *ctx;
529 int err = 0;
530
531 ntfs_debug("Entering for i_ino 0x%lx.", vi->i_ino);
532
533 /* Setup the generic vfs inode parts now. */
534
535 /* This is the optimal IO size (for stat), not the fs block size. */
536 vi->i_blksize = PAGE_CACHE_SIZE;
537 /*
538 * This is for checking whether an inode has changed w.r.t. a file so
539 * that the file can be updated if necessary (compare with f_version).
540 */
541 vi->i_version = 1;
542
543 vi->i_uid = vol->uid;
544 vi->i_gid = vol->gid;
545 vi->i_mode = 0;
546
547 /*
548 * Initialize the ntfs specific part of @vi special casing
549 * FILE_MFT which we need to do at mount time.
550 */
551 if (vi->i_ino != FILE_MFT)
552 ntfs_init_big_inode(vi);
553 ni = NTFS_I(vi);
554
555 m = map_mft_record(ni);
556 if (IS_ERR(m)) {
557 err = PTR_ERR(m);
558 goto err_out;
559 }
560 ctx = ntfs_attr_get_search_ctx(ni, m);
561 if (!ctx) {
562 err = -ENOMEM;
563 goto unm_err_out;
564 }
565
566 if (!(m->flags & MFT_RECORD_IN_USE)) {
567 ntfs_error(vi->i_sb, "Inode is not in use!");
568 goto unm_err_out;
569 }
570 if (m->base_mft_record) {
571 ntfs_error(vi->i_sb, "Inode is an extent inode!");
572 goto unm_err_out;
573 }
574
575 /* Transfer information from mft record into vfs and ntfs inodes. */
576 vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number);
577
578 /*
579 * FIXME: Keep in mind that link_count is two for files which have both
580 * a long file name and a short file name as separate entries, so if
581 * we are hiding short file names this will be too high. Either we need
582 * to account for the short file names by subtracting them or we need
583 * to make sure we delete files even though i_nlink is not zero which
584 * might be tricky due to vfs interactions. Need to think about this
585 * some more when implementing the unlink command.
586 */
587 vi->i_nlink = le16_to_cpu(m->link_count);
588 /*
589 * FIXME: Reparse points can have the directory bit set even though
590 * they would be S_IFLNK. Need to deal with this further below when we
591 * implement reparse points / symbolic links but it will do for now.
592 * Also if not a directory, it could be something else, rather than
593 * a regular file. But again, will do for now.
594 */
595 /* Everyone gets all permissions. */
596 vi->i_mode |= S_IRWXUGO;
597 /* If read-only, noone gets write permissions. */
598 if (IS_RDONLY(vi))
599 vi->i_mode &= ~S_IWUGO;
600 if (m->flags & MFT_RECORD_IS_DIRECTORY) {
601 vi->i_mode |= S_IFDIR;
602 /*
603 * Apply the directory permissions mask set in the mount
604 * options.
605 */
606 vi->i_mode &= ~vol->dmask;
607 /* Things break without this kludge! */
608 if (vi->i_nlink > 1)
609 vi->i_nlink = 1;
610 } else {
611 vi->i_mode |= S_IFREG;
612 /* Apply the file permissions mask set in the mount options. */
613 vi->i_mode &= ~vol->fmask;
614 }
615 /*
616 * Find the standard information attribute in the mft record. At this
617 * stage we haven't setup the attribute list stuff yet, so this could
618 * in fact fail if the standard information is in an extent record, but
619 * I don't think this actually ever happens.
620 */
621 err = ntfs_attr_lookup(AT_STANDARD_INFORMATION, NULL, 0, 0, 0, NULL, 0,
622 ctx);
623 if (unlikely(err)) {
624 if (err == -ENOENT) {
625 /*
626 * TODO: We should be performing a hot fix here (if the
627 * recover mount option is set) by creating a new
628 * attribute.
629 */
630 ntfs_error(vi->i_sb, "$STANDARD_INFORMATION attribute "
631 "is missing.");
632 }
633 goto unm_err_out;
634 }
635 /* Get the standard information attribute value. */
636 si = (STANDARD_INFORMATION*)((char*)ctx->attr +
637 le16_to_cpu(ctx->attr->data.resident.value_offset));
638
639 /* Transfer information from the standard information into vi. */
640 /*
641 * Note: The i_?times do not quite map perfectly onto the NTFS times,
642 * but they are close enough, and in the end it doesn't really matter
643 * that much...
644 */
645 /*
646 * mtime is the last change of the data within the file. Not changed
647 * when only metadata is changed, e.g. a rename doesn't affect mtime.
648 */
649 vi->i_mtime = ntfs2utc(si->last_data_change_time);
650 /*
651 * ctime is the last change of the metadata of the file. This obviously
652 * always changes, when mtime is changed. ctime can be changed on its
653 * own, mtime is then not changed, e.g. when a file is renamed.
654 */
655 vi->i_ctime = ntfs2utc(si->last_mft_change_time);
656 /*
657 * Last access to the data within the file. Not changed during a rename
658 * for example but changed whenever the file is written to.
659 */
660 vi->i_atime = ntfs2utc(si->last_access_time);
661
662 /* Find the attribute list attribute if present. */
663 ntfs_attr_reinit_search_ctx(ctx);
664 err = ntfs_attr_lookup(AT_ATTRIBUTE_LIST, NULL, 0, 0, 0, NULL, 0, ctx);
665 if (err) {
666 if (unlikely(err != -ENOENT)) {
667 ntfs_error(vi->i_sb, "Failed to lookup attribute list "
668 "attribute.");
669 goto unm_err_out;
670 }
671 } else /* if (!err) */ {
672 if (vi->i_ino == FILE_MFT)
673 goto skip_attr_list_load;
674 ntfs_debug("Attribute list found in inode 0x%lx.", vi->i_ino);
675 NInoSetAttrList(ni);
676 if (ctx->attr->flags & ATTR_IS_ENCRYPTED ||
677 ctx->attr->flags & ATTR_COMPRESSION_MASK ||
678 ctx->attr->flags & ATTR_IS_SPARSE) {
679 ntfs_error(vi->i_sb, "Attribute list attribute is "
680 "compressed/encrypted/sparse.");
681 goto unm_err_out;
682 }
683 /* Now allocate memory for the attribute list. */
684 ni->attr_list_size = (u32)ntfs_attr_size(ctx->attr);
685 ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size);
686 if (!ni->attr_list) {
687 ntfs_error(vi->i_sb, "Not enough memory to allocate "
688 "buffer for attribute list.");
689 err = -ENOMEM;
690 goto unm_err_out;
691 }
692 if (ctx->attr->non_resident) {
693 NInoSetAttrListNonResident(ni);
694 if (ctx->attr->data.non_resident.lowest_vcn) {
695 ntfs_error(vi->i_sb, "Attribute list has non "
696 "zero lowest_vcn.");
697 goto unm_err_out;
698 }
699 /*
700 * Setup the runlist. No need for locking as we have
701 * exclusive access to the inode at this time.
702 */
703 ni->attr_list_rl.rl = ntfs_mapping_pairs_decompress(vol,
704 ctx->attr, NULL);
705 if (IS_ERR(ni->attr_list_rl.rl)) {
706 err = PTR_ERR(ni->attr_list_rl.rl);
707 ni->attr_list_rl.rl = NULL;
708 ntfs_error(vi->i_sb, "Mapping pairs "
709 "decompression failed.");
710 goto unm_err_out;
711 }
712 /* Now load the attribute list. */
713 if ((err = load_attribute_list(vol, &ni->attr_list_rl,
714 ni->attr_list, ni->attr_list_size,
715 sle64_to_cpu(ctx->attr->data.
716 non_resident.initialized_size)))) {
717 ntfs_error(vi->i_sb, "Failed to load "
718 "attribute list attribute.");
719 goto unm_err_out;
720 }
721 } else /* if (!ctx.attr->non_resident) */ {
722 if ((u8*)ctx->attr + le16_to_cpu(
723 ctx->attr->data.resident.value_offset) +
724 le32_to_cpu(
725 ctx->attr->data.resident.value_length) >
726 (u8*)ctx->mrec + vol->mft_record_size) {
727 ntfs_error(vi->i_sb, "Corrupt attribute list "
728 "in inode.");
729 goto unm_err_out;
730 }
731 /* Now copy the attribute list. */
732 memcpy(ni->attr_list, (u8*)ctx->attr + le16_to_cpu(
733 ctx->attr->data.resident.value_offset),
734 le32_to_cpu(
735 ctx->attr->data.resident.value_length));
736 }
737 }
738skip_attr_list_load:
739 /*
740 * If an attribute list is present we now have the attribute list value
741 * in ntfs_ino->attr_list and it is ntfs_ino->attr_list_size bytes.
742 */
743 if (S_ISDIR(vi->i_mode)) {
744 struct inode *bvi;
745 ntfs_inode *bni;
746 INDEX_ROOT *ir;
747 char *ir_end, *index_end;
748
749 /* It is a directory, find index root attribute. */
750 ntfs_attr_reinit_search_ctx(ctx);
751 err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE,
752 0, NULL, 0, ctx);
753 if (unlikely(err)) {
754 if (err == -ENOENT) {
755 // FIXME: File is corrupt! Hot-fix with empty
756 // index root attribute if recovery option is
757 // set.
758 ntfs_error(vi->i_sb, "$INDEX_ROOT attribute "
759 "is missing.");
760 }
761 goto unm_err_out;
762 }
763 /* Set up the state. */
764 if (unlikely(ctx->attr->non_resident)) {
765 ntfs_error(vol->sb, "$INDEX_ROOT attribute is not "
766 "resident.");
767 goto unm_err_out;
768 }
769 /* Ensure the attribute name is placed before the value. */
770 if (unlikely(ctx->attr->name_length &&
771 (le16_to_cpu(ctx->attr->name_offset) >=
772 le16_to_cpu(ctx->attr->data.resident.
773 value_offset)))) {
774 ntfs_error(vol->sb, "$INDEX_ROOT attribute name is "
775 "placed after the attribute value.");
776 goto unm_err_out;
777 }
778 /*
779 * Compressed/encrypted index root just means that the newly
780 * created files in that directory should be created compressed/
781 * encrypted. However index root cannot be both compressed and
782 * encrypted.
783 */
784 if (ctx->attr->flags & ATTR_COMPRESSION_MASK)
785 NInoSetCompressed(ni);
786 if (ctx->attr->flags & ATTR_IS_ENCRYPTED) {
787 if (ctx->attr->flags & ATTR_COMPRESSION_MASK) {
788 ntfs_error(vi->i_sb, "Found encrypted and "
789 "compressed attribute.");
790 goto unm_err_out;
791 }
792 NInoSetEncrypted(ni);
793 }
794 if (ctx->attr->flags & ATTR_IS_SPARSE)
795 NInoSetSparse(ni);
796 ir = (INDEX_ROOT*)((char*)ctx->attr + le16_to_cpu(
797 ctx->attr->data.resident.value_offset));
798 ir_end = (char*)ir + le32_to_cpu(
799 ctx->attr->data.resident.value_length);
800 if (ir_end > (char*)ctx->mrec + vol->mft_record_size) {
801 ntfs_error(vi->i_sb, "$INDEX_ROOT attribute is "
802 "corrupt.");
803 goto unm_err_out;
804 }
805 index_end = (char*)&ir->index +
806 le32_to_cpu(ir->index.index_length);
807 if (index_end > ir_end) {
808 ntfs_error(vi->i_sb, "Directory index is corrupt.");
809 goto unm_err_out;
810 }
811 if (ir->type != AT_FILE_NAME) {
812 ntfs_error(vi->i_sb, "Indexed attribute is not "
813 "$FILE_NAME.");
814 goto unm_err_out;
815 }
816 if (ir->collation_rule != COLLATION_FILE_NAME) {
817 ntfs_error(vi->i_sb, "Index collation rule is not "
818 "COLLATION_FILE_NAME.");
819 goto unm_err_out;
820 }
821 ni->itype.index.collation_rule = ir->collation_rule;
822 ni->itype.index.block_size = le32_to_cpu(ir->index_block_size);
823 if (ni->itype.index.block_size &
824 (ni->itype.index.block_size - 1)) {
825 ntfs_error(vi->i_sb, "Index block size (%u) is not a "
826 "power of two.",
827 ni->itype.index.block_size);
828 goto unm_err_out;
829 }
830 if (ni->itype.index.block_size > PAGE_CACHE_SIZE) {
831 ntfs_error(vi->i_sb, "Index block size (%u) > "
832 "PAGE_CACHE_SIZE (%ld) is not "
833 "supported. Sorry.",
834 ni->itype.index.block_size,
835 PAGE_CACHE_SIZE);
836 err = -EOPNOTSUPP;
837 goto unm_err_out;
838 }
839 if (ni->itype.index.block_size < NTFS_BLOCK_SIZE) {
840 ntfs_error(vi->i_sb, "Index block size (%u) < "
841 "NTFS_BLOCK_SIZE (%i) is not "
842 "supported. Sorry.",
843 ni->itype.index.block_size,
844 NTFS_BLOCK_SIZE);
845 err = -EOPNOTSUPP;
846 goto unm_err_out;
847 }
848 ni->itype.index.block_size_bits =
849 ffs(ni->itype.index.block_size) - 1;
850 /* Determine the size of a vcn in the directory index. */
851 if (vol->cluster_size <= ni->itype.index.block_size) {
852 ni->itype.index.vcn_size = vol->cluster_size;
853 ni->itype.index.vcn_size_bits = vol->cluster_size_bits;
854 } else {
855 ni->itype.index.vcn_size = vol->sector_size;
856 ni->itype.index.vcn_size_bits = vol->sector_size_bits;
857 }
858
859 /* Setup the index allocation attribute, even if not present. */
860 NInoSetMstProtected(ni);
861 ni->type = AT_INDEX_ALLOCATION;
862 ni->name = I30;
863 ni->name_len = 4;
864
865 if (!(ir->index.flags & LARGE_INDEX)) {
866 /* No index allocation. */
867 vi->i_size = ni->initialized_size =
868 ni->allocated_size = 0;
869 /* We are done with the mft record, so we release it. */
870 ntfs_attr_put_search_ctx(ctx);
871 unmap_mft_record(ni);
872 m = NULL;
873 ctx = NULL;
874 goto skip_large_dir_stuff;
875 } /* LARGE_INDEX: Index allocation present. Setup state. */
876 NInoSetIndexAllocPresent(ni);
877 /* Find index allocation attribute. */
878 ntfs_attr_reinit_search_ctx(ctx);
879 err = ntfs_attr_lookup(AT_INDEX_ALLOCATION, I30, 4,
880 CASE_SENSITIVE, 0, NULL, 0, ctx);
881 if (unlikely(err)) {
882 if (err == -ENOENT)
883 ntfs_error(vi->i_sb, "$INDEX_ALLOCATION "
884 "attribute is not present but "
885 "$INDEX_ROOT indicated it is.");
886 else
887 ntfs_error(vi->i_sb, "Failed to lookup "
888 "$INDEX_ALLOCATION "
889 "attribute.");
890 goto unm_err_out;
891 }
892 if (!ctx->attr->non_resident) {
893 ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute "
894 "is resident.");
895 goto unm_err_out;
896 }
897 /*
898 * Ensure the attribute name is placed before the mapping pairs
899 * array.
900 */
901 if (unlikely(ctx->attr->name_length &&
902 (le16_to_cpu(ctx->attr->name_offset) >=
903 le16_to_cpu(ctx->attr->data.non_resident.
904 mapping_pairs_offset)))) {
905 ntfs_error(vol->sb, "$INDEX_ALLOCATION attribute name "
906 "is placed after the mapping pairs "
907 "array.");
908 goto unm_err_out;
909 }
910 if (ctx->attr->flags & ATTR_IS_ENCRYPTED) {
911 ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute "
912 "is encrypted.");
913 goto unm_err_out;
914 }
915 if (ctx->attr->flags & ATTR_IS_SPARSE) {
916 ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute "
917 "is sparse.");
918 goto unm_err_out;
919 }
920 if (ctx->attr->flags & ATTR_COMPRESSION_MASK) {
921 ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute "
922 "is compressed.");
923 goto unm_err_out;
924 }
925 if (ctx->attr->data.non_resident.lowest_vcn) {
926 ntfs_error(vi->i_sb, "First extent of "
927 "$INDEX_ALLOCATION attribute has non "
928 "zero lowest_vcn.");
929 goto unm_err_out;
930 }
931 vi->i_size = sle64_to_cpu(
932 ctx->attr->data.non_resident.data_size);
933 ni->initialized_size = sle64_to_cpu(
934 ctx->attr->data.non_resident.initialized_size);
935 ni->allocated_size = sle64_to_cpu(
936 ctx->attr->data.non_resident.allocated_size);
937 /*
938 * We are done with the mft record, so we release it. Otherwise
939 * we would deadlock in ntfs_attr_iget().
940 */
941 ntfs_attr_put_search_ctx(ctx);
942 unmap_mft_record(ni);
943 m = NULL;
944 ctx = NULL;
945 /* Get the index bitmap attribute inode. */
946 bvi = ntfs_attr_iget(vi, AT_BITMAP, I30, 4);
947 if (IS_ERR(bvi)) {
948 ntfs_error(vi->i_sb, "Failed to get bitmap attribute.");
949 err = PTR_ERR(bvi);
950 goto unm_err_out;
951 }
952 ni->itype.index.bmp_ino = bvi;
953 bni = NTFS_I(bvi);
954 if (NInoCompressed(bni) || NInoEncrypted(bni) ||
955 NInoSparse(bni)) {
956 ntfs_error(vi->i_sb, "$BITMAP attribute is compressed "
957 "and/or encrypted and/or sparse.");
958 goto unm_err_out;
959 }
960 /* Consistency check bitmap size vs. index allocation size. */
961 if ((bvi->i_size << 3) < (vi->i_size >>
962 ni->itype.index.block_size_bits)) {
963 ntfs_error(vi->i_sb, "Index bitmap too small (0x%llx) "
964 "for index allocation (0x%llx).",
965 bvi->i_size << 3, vi->i_size);
966 goto unm_err_out;
967 }
968skip_large_dir_stuff:
969 /* Setup the operations for this inode. */
970 vi->i_op = &ntfs_dir_inode_ops;
971 vi->i_fop = &ntfs_dir_ops;
972 } else {
973 /* It is a file. */
974 ntfs_attr_reinit_search_ctx(ctx);
975
976 /* Setup the data attribute, even if not present. */
977 ni->type = AT_DATA;
978 ni->name = NULL;
979 ni->name_len = 0;
980
981 /* Find first extent of the unnamed data attribute. */
982 err = ntfs_attr_lookup(AT_DATA, NULL, 0, 0, 0, NULL, 0, ctx);
983 if (unlikely(err)) {
984 vi->i_size = ni->initialized_size =
985 ni->allocated_size = 0;
986 if (err != -ENOENT) {
987 ntfs_error(vi->i_sb, "Failed to lookup $DATA "
988 "attribute.");
989 goto unm_err_out;
990 }
991 /*
992 * FILE_Secure does not have an unnamed $DATA
993 * attribute, so we special case it here.
994 */
995 if (vi->i_ino == FILE_Secure)
996 goto no_data_attr_special_case;
997 /*
998 * Most if not all the system files in the $Extend
999 * system directory do not have unnamed data
1000 * attributes so we need to check if the parent
1001 * directory of the file is FILE_Extend and if it is
1002 * ignore this error. To do this we need to get the
1003 * name of this inode from the mft record as the name
1004 * contains the back reference to the parent directory.
1005 */
1006 if (ntfs_is_extended_system_file(ctx) > 0)
1007 goto no_data_attr_special_case;
1008 // FIXME: File is corrupt! Hot-fix with empty data
1009 // attribute if recovery option is set.
1010 ntfs_error(vi->i_sb, "$DATA attribute is missing.");
1011 goto unm_err_out;
1012 }
1013 /* Setup the state. */
1014 if (ctx->attr->non_resident) {
1015 NInoSetNonResident(ni);
1016 if (ctx->attr->flags & ATTR_COMPRESSION_MASK) {
1017 NInoSetCompressed(ni);
1018 if (vol->cluster_size > 4096) {
1019 ntfs_error(vi->i_sb, "Found "
1020 "compressed data but "
1021 "compression is disabled due "
1022 "to cluster size (%i) > 4kiB.",
1023 vol->cluster_size);
1024 goto unm_err_out;
1025 }
1026 if ((ctx->attr->flags & ATTR_COMPRESSION_MASK)
1027 != ATTR_IS_COMPRESSED) {
1028 ntfs_error(vi->i_sb, "Found "
1029 "unknown compression method or "
1030 "corrupt file.");
1031 goto unm_err_out;
1032 }
1033 ni->itype.compressed.block_clusters = 1U <<
1034 ctx->attr->data.non_resident.
1035 compression_unit;
1036 if (ctx->attr->data.non_resident.
1037 compression_unit != 4) {
1038 ntfs_error(vi->i_sb, "Found "
1039 "nonstandard compression unit "
1040 "(%u instead of 4). Cannot "
1041 "handle this.",
1042 ctx->attr->data.non_resident.
1043 compression_unit);
1044 err = -EOPNOTSUPP;
1045 goto unm_err_out;
1046 }
1047 ni->itype.compressed.block_size = 1U << (
1048 ctx->attr->data.non_resident.
1049 compression_unit +
1050 vol->cluster_size_bits);
1051 ni->itype.compressed.block_size_bits = ffs(
1052 ni->itype.compressed.block_size) - 1;
1053 }
1054 if (ctx->attr->flags & ATTR_IS_ENCRYPTED) {
1055 if (ctx->attr->flags & ATTR_COMPRESSION_MASK) {
1056 ntfs_error(vi->i_sb, "Found encrypted "
1057 "and compressed data.");
1058 goto unm_err_out;
1059 }
1060 NInoSetEncrypted(ni);
1061 }
1062 if (ctx->attr->flags & ATTR_IS_SPARSE)
1063 NInoSetSparse(ni);
1064 if (ctx->attr->data.non_resident.lowest_vcn) {
1065 ntfs_error(vi->i_sb, "First extent of $DATA "
1066 "attribute has non zero "
1067 "lowest_vcn.");
1068 goto unm_err_out;
1069 }
1070 /* Setup all the sizes. */
1071 vi->i_size = sle64_to_cpu(
1072 ctx->attr->data.non_resident.data_size);
1073 ni->initialized_size = sle64_to_cpu(
1074 ctx->attr->data.non_resident.
1075 initialized_size);
1076 ni->allocated_size = sle64_to_cpu(
1077 ctx->attr->data.non_resident.
1078 allocated_size);
1079 if (NInoCompressed(ni)) {
1080 ni->itype.compressed.size = sle64_to_cpu(
1081 ctx->attr->data.non_resident.
1082 compressed_size);
1083 }
1084 } else { /* Resident attribute. */
1085 /*
1086 * Make all sizes equal for simplicity in read code
1087 * paths. FIXME: Need to keep this in mind when
1088 * converting to non-resident attribute in write code
1089 * path. (Probably only affects truncate().)
1090 */
1091 vi->i_size = ni->initialized_size = ni->allocated_size =
1092 le32_to_cpu(
1093 ctx->attr->data.resident.value_length);
1094 }
1095no_data_attr_special_case:
1096 /* We are done with the mft record, so we release it. */
1097 ntfs_attr_put_search_ctx(ctx);
1098 unmap_mft_record(ni);
1099 m = NULL;
1100 ctx = NULL;
1101 /* Setup the operations for this inode. */
1102 vi->i_op = &ntfs_file_inode_ops;
1103 vi->i_fop = &ntfs_file_ops;
1104 }
1105 if (NInoMstProtected(ni))
1106 vi->i_mapping->a_ops = &ntfs_mst_aops;
1107 else
1108 vi->i_mapping->a_ops = &ntfs_aops;
1109 /*
1110 * The number of 512-byte blocks used on disk (for stat). This is in so
1111 * far inaccurate as it doesn't account for any named streams or other
1112 * special non-resident attributes, but that is how Windows works, too,
1113 * so we are at least consistent with Windows, if not entirely
1114 * consistent with the Linux Way. Doing it the Linux Way would cause a
1115 * significant slowdown as it would involve iterating over all
1116 * attributes in the mft record and adding the allocated/compressed
1117 * sizes of all non-resident attributes present to give us the Linux
1118 * correct size that should go into i_blocks (after division by 512).
1119 */
1120 if (S_ISDIR(vi->i_mode) || !NInoCompressed(ni))
1121 vi->i_blocks = ni->allocated_size >> 9;
1122 else
1123 vi->i_blocks = ni->itype.compressed.size >> 9;
1124
1125 ntfs_debug("Done.");
1126 return 0;
1127
1128unm_err_out:
1129 if (!err)
1130 err = -EIO;
1131 if (ctx)
1132 ntfs_attr_put_search_ctx(ctx);
1133 if (m)
1134 unmap_mft_record(ni);
1135err_out:
1136 ntfs_error(vol->sb, "Failed with error code %i. Marking corrupt "
1137 "inode 0x%lx as bad. Run chkdsk.", err, vi->i_ino);
1138 make_bad_inode(vi);
1139 if (err != -EOPNOTSUPP && err != -ENOMEM)
1140 NVolSetErrors(vol);
1141 return err;
1142}
1143
1144/**
1145 * ntfs_read_locked_attr_inode - read an attribute inode from its base inode
1146 * @base_vi: base inode
1147 * @vi: attribute inode to read
1148 *
1149 * ntfs_read_locked_attr_inode() is called from ntfs_attr_iget() to read the
1150 * attribute inode described by @vi into memory from the base mft record
1151 * described by @base_ni.
1152 *
1153 * ntfs_read_locked_attr_inode() maps, pins and locks the base inode for
1154 * reading and looks up the attribute described by @vi before setting up the
1155 * necessary fields in @vi as well as initializing the ntfs inode.
1156 *
1157 * Q: What locks are held when the function is called?
1158 * A: i_state has I_LOCK set, hence the inode is locked, also
1159 * i_count is set to 1, so it is not going to go away
1160 *
1161 * Return 0 on success and -errno on error. In the error case, the inode will
1162 * have had make_bad_inode() executed on it.
1163 */
1164static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
1165{
1166 ntfs_volume *vol = NTFS_SB(vi->i_sb);
1167 ntfs_inode *ni, *base_ni;
1168 MFT_RECORD *m;
1169 ntfs_attr_search_ctx *ctx;
1170 int err = 0;
1171
1172 ntfs_debug("Entering for i_ino 0x%lx.", vi->i_ino);
1173
1174 ntfs_init_big_inode(vi);
1175
1176 ni = NTFS_I(vi);
1177 base_ni = NTFS_I(base_vi);
1178
1179 /* Just mirror the values from the base inode. */
1180 vi->i_blksize = base_vi->i_blksize;
1181 vi->i_version = base_vi->i_version;
1182 vi->i_uid = base_vi->i_uid;
1183 vi->i_gid = base_vi->i_gid;
1184 vi->i_nlink = base_vi->i_nlink;
1185 vi->i_mtime = base_vi->i_mtime;
1186 vi->i_ctime = base_vi->i_ctime;
1187 vi->i_atime = base_vi->i_atime;
1188 vi->i_generation = ni->seq_no = base_ni->seq_no;
1189
1190 /* Set inode type to zero but preserve permissions. */
1191 vi->i_mode = base_vi->i_mode & ~S_IFMT;
1192
1193 m = map_mft_record(base_ni);
1194 if (IS_ERR(m)) {
1195 err = PTR_ERR(m);
1196 goto err_out;
1197 }
1198 ctx = ntfs_attr_get_search_ctx(base_ni, m);
1199 if (!ctx) {
1200 err = -ENOMEM;
1201 goto unm_err_out;
1202 }
1203
1204 /* Find the attribute. */
1205 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
1206 CASE_SENSITIVE, 0, NULL, 0, ctx);
1207 if (unlikely(err))
1208 goto unm_err_out;
1209
1210 if (!ctx->attr->non_resident) {
1211 /* Ensure the attribute name is placed before the value. */
1212 if (unlikely(ctx->attr->name_length &&
1213 (le16_to_cpu(ctx->attr->name_offset) >=
1214 le16_to_cpu(ctx->attr->data.resident.
1215 value_offset)))) {
1216 ntfs_error(vol->sb, "Attribute name is placed after "
1217 "the attribute value.");
1218 goto unm_err_out;
1219 }
1220 if (NInoMstProtected(ni) || ctx->attr->flags) {
1221 ntfs_error(vi->i_sb, "Found mst protected attribute "
1222 "or attribute with non-zero flags but "
1223 "the attribute is resident. Please "
1224 "report you saw this message to "
1225 "linux-ntfs-dev@lists.sourceforge.net");
1226 goto unm_err_out;
1227 }
1228 /*
1229 * Resident attribute. Make all sizes equal for simplicity in
1230 * read code paths.
1231 */
1232 vi->i_size = ni->initialized_size = ni->allocated_size =
1233 le32_to_cpu(ctx->attr->data.resident.value_length);
1234 } else {
1235 NInoSetNonResident(ni);
1236 /*
1237 * Ensure the attribute name is placed before the mapping pairs
1238 * array.
1239 */
1240 if (unlikely(ctx->attr->name_length &&
1241 (le16_to_cpu(ctx->attr->name_offset) >=
1242 le16_to_cpu(ctx->attr->data.non_resident.
1243 mapping_pairs_offset)))) {
1244 ntfs_error(vol->sb, "Attribute name is placed after "
1245 "the mapping pairs array.");
1246 goto unm_err_out;
1247 }
1248 if (ctx->attr->flags & ATTR_COMPRESSION_MASK) {
1249 if (NInoMstProtected(ni)) {
1250 ntfs_error(vi->i_sb, "Found mst protected "
1251 "attribute but the attribute "
1252 "is compressed. Please report "
1253 "you saw this message to "
1254 "linux-ntfs-dev@lists."
1255 "sourceforge.net");
1256 goto unm_err_out;
1257 }
1258 NInoSetCompressed(ni);
1259 if ((ni->type != AT_DATA) || (ni->type == AT_DATA &&
1260 ni->name_len)) {
1261 ntfs_error(vi->i_sb, "Found compressed "
1262 "non-data or named data "
1263 "attribute. Please report "
1264 "you saw this message to "
1265 "linux-ntfs-dev@lists."
1266 "sourceforge.net");
1267 goto unm_err_out;
1268 }
1269 if (vol->cluster_size > 4096) {
1270 ntfs_error(vi->i_sb, "Found compressed "
1271 "attribute but compression is "
1272 "disabled due to cluster size "
1273 "(%i) > 4kiB.",
1274 vol->cluster_size);
1275 goto unm_err_out;
1276 }
1277 if ((ctx->attr->flags & ATTR_COMPRESSION_MASK)
1278 != ATTR_IS_COMPRESSED) {
1279 ntfs_error(vi->i_sb, "Found unknown "
1280 "compression method.");
1281 goto unm_err_out;
1282 }
1283 ni->itype.compressed.block_clusters = 1U <<
1284 ctx->attr->data.non_resident.
1285 compression_unit;
1286 if (ctx->attr->data.non_resident.compression_unit !=
1287 4) {
1288 ntfs_error(vi->i_sb, "Found nonstandard "
1289 "compression unit (%u instead "
1290 "of 4). Cannot handle this.",
1291 ctx->attr->data.non_resident.
1292 compression_unit);
1293 err = -EOPNOTSUPP;
1294 goto unm_err_out;
1295 }
1296 ni->itype.compressed.block_size = 1U << (
1297 ctx->attr->data.non_resident.
1298 compression_unit +
1299 vol->cluster_size_bits);
1300 ni->itype.compressed.block_size_bits = ffs(
1301 ni->itype.compressed.block_size) - 1;
1302 }
1303 if (ctx->attr->flags & ATTR_IS_ENCRYPTED) {
1304 if (ctx->attr->flags & ATTR_COMPRESSION_MASK) {
1305 ntfs_error(vi->i_sb, "Found encrypted "
1306 "and compressed data.");
1307 goto unm_err_out;
1308 }
1309 if (NInoMstProtected(ni)) {
1310 ntfs_error(vi->i_sb, "Found mst protected "
1311 "attribute but the attribute "
1312 "is encrypted. Please report "
1313 "you saw this message to "
1314 "linux-ntfs-dev@lists."
1315 "sourceforge.net");
1316 goto unm_err_out;
1317 }
1318 NInoSetEncrypted(ni);
1319 }
1320 if (ctx->attr->flags & ATTR_IS_SPARSE) {
1321 if (NInoMstProtected(ni)) {
1322 ntfs_error(vi->i_sb, "Found mst protected "
1323 "attribute but the attribute "
1324 "is sparse. Please report "
1325 "you saw this message to "
1326 "linux-ntfs-dev@lists."
1327 "sourceforge.net");
1328 goto unm_err_out;
1329 }
1330 NInoSetSparse(ni);
1331 }
1332 if (ctx->attr->data.non_resident.lowest_vcn) {
1333 ntfs_error(vi->i_sb, "First extent of attribute has "
1334 "non-zero lowest_vcn.");
1335 goto unm_err_out;
1336 }
1337 /* Setup all the sizes. */
1338 vi->i_size = sle64_to_cpu(
1339 ctx->attr->data.non_resident.data_size);
1340 ni->initialized_size = sle64_to_cpu(
1341 ctx->attr->data.non_resident.initialized_size);
1342 ni->allocated_size = sle64_to_cpu(
1343 ctx->attr->data.non_resident.allocated_size);
1344 if (NInoCompressed(ni)) {
1345 ni->itype.compressed.size = sle64_to_cpu(
1346 ctx->attr->data.non_resident.
1347 compressed_size);
1348 }
1349 }
1350
1351 /* Setup the operations for this attribute inode. */
1352 vi->i_op = NULL;
1353 vi->i_fop = NULL;
1354 if (NInoMstProtected(ni))
1355 vi->i_mapping->a_ops = &ntfs_mst_aops;
1356 else
1357 vi->i_mapping->a_ops = &ntfs_aops;
1358
1359 if (!NInoCompressed(ni))
1360 vi->i_blocks = ni->allocated_size >> 9;
1361 else
1362 vi->i_blocks = ni->itype.compressed.size >> 9;
1363
1364 /*
1365 * Make sure the base inode doesn't go away and attach it to the
1366 * attribute inode.
1367 */
1368 igrab(base_vi);
1369 ni->ext.base_ntfs_ino = base_ni;
1370 ni->nr_extents = -1;
1371
1372 ntfs_attr_put_search_ctx(ctx);
1373 unmap_mft_record(base_ni);
1374
1375 ntfs_debug("Done.");
1376 return 0;
1377
1378unm_err_out:
1379 if (!err)
1380 err = -EIO;
1381 if (ctx)
1382 ntfs_attr_put_search_ctx(ctx);
1383 unmap_mft_record(base_ni);
1384err_out:
1385 ntfs_error(vol->sb, "Failed with error code %i while reading attribute "
1386 "inode (mft_no 0x%lx, type 0x%x, name_len %i). "
1387 "Marking corrupt inode and base inode 0x%lx as bad. "
1388 "Run chkdsk.", err, vi->i_ino, ni->type, ni->name_len,
1389 base_vi->i_ino);
1390 make_bad_inode(vi);
1391 make_bad_inode(base_vi);
1392 if (err != -ENOMEM)
1393 NVolSetErrors(vol);
1394 return err;
1395}
1396
1397/**
1398 * ntfs_read_locked_index_inode - read an index inode from its base inode
1399 * @base_vi: base inode
1400 * @vi: index inode to read
1401 *
1402 * ntfs_read_locked_index_inode() is called from ntfs_index_iget() to read the
1403 * index inode described by @vi into memory from the base mft record described
1404 * by @base_ni.
1405 *
1406 * ntfs_read_locked_index_inode() maps, pins and locks the base inode for
1407 * reading and looks up the attributes relating to the index described by @vi
1408 * before setting up the necessary fields in @vi as well as initializing the
1409 * ntfs inode.
1410 *
1411 * Note, index inodes are essentially attribute inodes (NInoAttr() is true)
1412 * with the attribute type set to AT_INDEX_ALLOCATION. Apart from that, they
1413 * are setup like directory inodes since directories are a special case of
1414 * indices ao they need to be treated in much the same way. Most importantly,
1415 * for small indices the index allocation attribute might not actually exist.
1416 * However, the index root attribute always exists but this does not need to
1417 * have an inode associated with it and this is why we define a new inode type
1418 * index. Also, like for directories, we need to have an attribute inode for
1419 * the bitmap attribute corresponding to the index allocation attribute and we
1420 * can store this in the appropriate field of the inode, just like we do for
1421 * normal directory inodes.
1422 *
1423 * Q: What locks are held when the function is called?
1424 * A: i_state has I_LOCK set, hence the inode is locked, also
1425 * i_count is set to 1, so it is not going to go away
1426 *
1427 * Return 0 on success and -errno on error. In the error case, the inode will
1428 * have had make_bad_inode() executed on it.
1429 */
1430static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi)
1431{
1432 ntfs_volume *vol = NTFS_SB(vi->i_sb);
1433 ntfs_inode *ni, *base_ni, *bni;
1434 struct inode *bvi;
1435 MFT_RECORD *m;
1436 ntfs_attr_search_ctx *ctx;
1437 INDEX_ROOT *ir;
1438 u8 *ir_end, *index_end;
1439 int err = 0;
1440
1441 ntfs_debug("Entering for i_ino 0x%lx.", vi->i_ino);
1442 ntfs_init_big_inode(vi);
1443 ni = NTFS_I(vi);
1444 base_ni = NTFS_I(base_vi);
1445 /* Just mirror the values from the base inode. */
1446 vi->i_blksize = base_vi->i_blksize;
1447 vi->i_version = base_vi->i_version;
1448 vi->i_uid = base_vi->i_uid;
1449 vi->i_gid = base_vi->i_gid;
1450 vi->i_nlink = base_vi->i_nlink;
1451 vi->i_mtime = base_vi->i_mtime;
1452 vi->i_ctime = base_vi->i_ctime;
1453 vi->i_atime = base_vi->i_atime;
1454 vi->i_generation = ni->seq_no = base_ni->seq_no;
1455 /* Set inode type to zero but preserve permissions. */
1456 vi->i_mode = base_vi->i_mode & ~S_IFMT;
1457 /* Map the mft record for the base inode. */
1458 m = map_mft_record(base_ni);
1459 if (IS_ERR(m)) {
1460 err = PTR_ERR(m);
1461 goto err_out;
1462 }
1463 ctx = ntfs_attr_get_search_ctx(base_ni, m);
1464 if (!ctx) {
1465 err = -ENOMEM;
1466 goto unm_err_out;
1467 }
1468 /* Find the index root attribute. */
1469 err = ntfs_attr_lookup(AT_INDEX_ROOT, ni->name, ni->name_len,
1470 CASE_SENSITIVE, 0, NULL, 0, ctx);
1471 if (unlikely(err)) {
1472 if (err == -ENOENT)
1473 ntfs_error(vi->i_sb, "$INDEX_ROOT attribute is "
1474 "missing.");
1475 goto unm_err_out;
1476 }
1477 /* Set up the state. */
1478 if (unlikely(ctx->attr->non_resident)) {
1479 ntfs_error(vol->sb, "$INDEX_ROOT attribute is not resident.");
1480 goto unm_err_out;
1481 }
1482 /* Ensure the attribute name is placed before the value. */
1483 if (unlikely(ctx->attr->name_length &&
1484 (le16_to_cpu(ctx->attr->name_offset) >=
1485 le16_to_cpu(ctx->attr->data.resident.
1486 value_offset)))) {
1487 ntfs_error(vol->sb, "$INDEX_ROOT attribute name is placed "
1488 "after the attribute value.");
1489 goto unm_err_out;
1490 }
1491 /* Compressed/encrypted/sparse index root is not allowed. */
1492 if (ctx->attr->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_ENCRYPTED |
1493 ATTR_IS_SPARSE)) {
1494 ntfs_error(vi->i_sb, "Found compressed/encrypted/sparse index "
1495 "root attribute.");
1496 goto unm_err_out;
1497 }
1498 ir = (INDEX_ROOT*)((u8*)ctx->attr +
1499 le16_to_cpu(ctx->attr->data.resident.value_offset));
1500 ir_end = (u8*)ir + le32_to_cpu(ctx->attr->data.resident.value_length);
1501 if (ir_end > (u8*)ctx->mrec + vol->mft_record_size) {
1502 ntfs_error(vi->i_sb, "$INDEX_ROOT attribute is corrupt.");
1503 goto unm_err_out;
1504 }
1505 index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length);
1506 if (index_end > ir_end) {
1507 ntfs_error(vi->i_sb, "Index is corrupt.");
1508 goto unm_err_out;
1509 }
1510 if (ir->type) {
1511 ntfs_error(vi->i_sb, "Index type is not 0 (type is 0x%x).",
1512 le32_to_cpu(ir->type));
1513 goto unm_err_out;
1514 }
1515 ni->itype.index.collation_rule = ir->collation_rule;
1516 ntfs_debug("Index collation rule is 0x%x.",
1517 le32_to_cpu(ir->collation_rule));
1518 ni->itype.index.block_size = le32_to_cpu(ir->index_block_size);
1519 if (ni->itype.index.block_size & (ni->itype.index.block_size - 1)) {
1520 ntfs_error(vi->i_sb, "Index block size (%u) is not a power of "
1521 "two.", ni->itype.index.block_size);
1522 goto unm_err_out;
1523 }
1524 if (ni->itype.index.block_size > PAGE_CACHE_SIZE) {
1525 ntfs_error(vi->i_sb, "Index block size (%u) > PAGE_CACHE_SIZE "
1526 "(%ld) is not supported. Sorry.",
1527 ni->itype.index.block_size, PAGE_CACHE_SIZE);
1528 err = -EOPNOTSUPP;
1529 goto unm_err_out;
1530 }
1531 if (ni->itype.index.block_size < NTFS_BLOCK_SIZE) {
1532 ntfs_error(vi->i_sb, "Index block size (%u) < NTFS_BLOCK_SIZE "
1533 "(%i) is not supported. Sorry.",
1534 ni->itype.index.block_size, NTFS_BLOCK_SIZE);
1535 err = -EOPNOTSUPP;
1536 goto unm_err_out;
1537 }
1538 ni->itype.index.block_size_bits = ffs(ni->itype.index.block_size) - 1;
1539 /* Determine the size of a vcn in the index. */
1540 if (vol->cluster_size <= ni->itype.index.block_size) {
1541 ni->itype.index.vcn_size = vol->cluster_size;
1542 ni->itype.index.vcn_size_bits = vol->cluster_size_bits;
1543 } else {
1544 ni->itype.index.vcn_size = vol->sector_size;
1545 ni->itype.index.vcn_size_bits = vol->sector_size_bits;
1546 }
1547 /* Check for presence of index allocation attribute. */
1548 if (!(ir->index.flags & LARGE_INDEX)) {
1549 /* No index allocation. */
1550 vi->i_size = ni->initialized_size = ni->allocated_size = 0;
1551 /* We are done with the mft record, so we release it. */
1552 ntfs_attr_put_search_ctx(ctx);
1553 unmap_mft_record(base_ni);
1554 m = NULL;
1555 ctx = NULL;
1556 goto skip_large_index_stuff;
1557 } /* LARGE_INDEX: Index allocation present. Setup state. */
1558 NInoSetIndexAllocPresent(ni);
1559 /* Find index allocation attribute. */
1560 ntfs_attr_reinit_search_ctx(ctx);
1561 err = ntfs_attr_lookup(AT_INDEX_ALLOCATION, ni->name, ni->name_len,
1562 CASE_SENSITIVE, 0, NULL, 0, ctx);
1563 if (unlikely(err)) {
1564 if (err == -ENOENT)
1565 ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is "
1566 "not present but $INDEX_ROOT "
1567 "indicated it is.");
1568 else
1569 ntfs_error(vi->i_sb, "Failed to lookup "
1570 "$INDEX_ALLOCATION attribute.");
1571 goto unm_err_out;
1572 }
1573 if (!ctx->attr->non_resident) {
1574 ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is "
1575 "resident.");
1576 goto unm_err_out;
1577 }
1578 /*
1579 * Ensure the attribute name is placed before the mapping pairs array.
1580 */
1581 if (unlikely(ctx->attr->name_length && (le16_to_cpu(
1582 ctx->attr->name_offset) >= le16_to_cpu(
1583 ctx->attr->data.non_resident.mapping_pairs_offset)))) {
1584 ntfs_error(vol->sb, "$INDEX_ALLOCATION attribute name is "
1585 "placed after the mapping pairs array.");
1586 goto unm_err_out;
1587 }
1588 if (ctx->attr->flags & ATTR_IS_ENCRYPTED) {
1589 ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is "
1590 "encrypted.");
1591 goto unm_err_out;
1592 }
1593 if (ctx->attr->flags & ATTR_IS_SPARSE) {
1594 ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is sparse.");
1595 goto unm_err_out;
1596 }
1597 if (ctx->attr->flags & ATTR_COMPRESSION_MASK) {
1598 ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is "
1599 "compressed.");
1600 goto unm_err_out;
1601 }
1602 if (ctx->attr->data.non_resident.lowest_vcn) {
1603 ntfs_error(vi->i_sb, "First extent of $INDEX_ALLOCATION "
1604 "attribute has non zero lowest_vcn.");
1605 goto unm_err_out;
1606 }
1607 vi->i_size = sle64_to_cpu(ctx->attr->data.non_resident.data_size);
1608 ni->initialized_size = sle64_to_cpu(
1609 ctx->attr->data.non_resident.initialized_size);
1610 ni->allocated_size = sle64_to_cpu(
1611 ctx->attr->data.non_resident.allocated_size);
1612 /*
1613 * We are done with the mft record, so we release it. Otherwise
1614 * we would deadlock in ntfs_attr_iget().
1615 */
1616 ntfs_attr_put_search_ctx(ctx);
1617 unmap_mft_record(base_ni);
1618 m = NULL;
1619 ctx = NULL;
1620 /* Get the index bitmap attribute inode. */
1621 bvi = ntfs_attr_iget(base_vi, AT_BITMAP, ni->name, ni->name_len);
1622 if (IS_ERR(bvi)) {
1623 ntfs_error(vi->i_sb, "Failed to get bitmap attribute.");
1624 err = PTR_ERR(bvi);
1625 goto unm_err_out;
1626 }
1627 bni = NTFS_I(bvi);
1628 if (NInoCompressed(bni) || NInoEncrypted(bni) ||
1629 NInoSparse(bni)) {
1630 ntfs_error(vi->i_sb, "$BITMAP attribute is compressed and/or "
1631 "encrypted and/or sparse.");
1632 goto iput_unm_err_out;
1633 }
1634 /* Consistency check bitmap size vs. index allocation size. */
1635 if ((bvi->i_size << 3) < (vi->i_size >>
1636 ni->itype.index.block_size_bits)) {
1637 ntfs_error(vi->i_sb, "Index bitmap too small (0x%llx) for "
1638 "index allocation (0x%llx).", bvi->i_size << 3,
1639 vi->i_size);
1640 goto iput_unm_err_out;
1641 }
1642 ni->itype.index.bmp_ino = bvi;
1643skip_large_index_stuff:
1644 /* Setup the operations for this index inode. */
1645 vi->i_op = NULL;
1646 vi->i_fop = NULL;
1647 vi->i_mapping->a_ops = &ntfs_mst_aops;
1648 vi->i_blocks = ni->allocated_size >> 9;
1649
1650 /*
1651 * Make sure the base inode doesn't go away and attach it to the
1652 * index inode.
1653 */
1654 igrab(base_vi);
1655 ni->ext.base_ntfs_ino = base_ni;
1656 ni->nr_extents = -1;
1657
1658 ntfs_debug("Done.");
1659 return 0;
1660
1661iput_unm_err_out:
1662 iput(bvi);
1663unm_err_out:
1664 if (!err)
1665 err = -EIO;
1666 if (ctx)
1667 ntfs_attr_put_search_ctx(ctx);
1668 if (m)
1669 unmap_mft_record(base_ni);
1670err_out:
1671 ntfs_error(vi->i_sb, "Failed with error code %i while reading index "
1672 "inode (mft_no 0x%lx, name_len %i.", err, vi->i_ino,
1673 ni->name_len);
1674 make_bad_inode(vi);
1675 if (err != -EOPNOTSUPP && err != -ENOMEM)
1676 NVolSetErrors(vol);
1677 return err;
1678}
1679
1680/**
1681 * ntfs_read_inode_mount - special read_inode for mount time use only
1682 * @vi: inode to read
1683 *
1684 * Read inode FILE_MFT at mount time, only called with super_block lock
1685 * held from within the read_super() code path.
1686 *
1687 * This function exists because when it is called the page cache for $MFT/$DATA
1688 * is not initialized and hence we cannot get at the contents of mft records
1689 * by calling map_mft_record*().
1690 *
1691 * Further it needs to cope with the circular references problem, i.e. cannot
1692 * load any attributes other than $ATTRIBUTE_LIST until $DATA is loaded, because
1693 * we do not know where the other extent mft records are yet and again, because
1694 * we cannot call map_mft_record*() yet. Obviously this applies only when an
1695 * attribute list is actually present in $MFT inode.
1696 *
1697 * We solve these problems by starting with the $DATA attribute before anything
1698 * else and iterating using ntfs_attr_lookup($DATA) over all extents. As each
1699 * extent is found, we ntfs_mapping_pairs_decompress() including the implied
1700 * ntfs_runlists_merge(). Each step of the iteration necessarily provides
1701 * sufficient information for the next step to complete.
1702 *
1703 * This should work but there are two possible pitfalls (see inline comments
1704 * below), but only time will tell if they are real pits or just smoke...
1705 */
1706int ntfs_read_inode_mount(struct inode *vi)
1707{
1708 VCN next_vcn, last_vcn, highest_vcn;
1709 s64 block;
1710 struct super_block *sb = vi->i_sb;
1711 ntfs_volume *vol = NTFS_SB(sb);
1712 struct buffer_head *bh;
1713 ntfs_inode *ni;
1714 MFT_RECORD *m = NULL;
1715 ATTR_RECORD *attr;
1716 ntfs_attr_search_ctx *ctx;
1717 unsigned int i, nr_blocks;
1718 int err;
1719
1720 ntfs_debug("Entering.");
1721
1722 /* Initialize the ntfs specific part of @vi. */
1723 ntfs_init_big_inode(vi);
1724
1725 ni = NTFS_I(vi);
1726
1727 /* Setup the data attribute. It is special as it is mst protected. */
1728 NInoSetNonResident(ni);
1729 NInoSetMstProtected(ni);
1730 ni->type = AT_DATA;
1731 ni->name = NULL;
1732 ni->name_len = 0;
1733
1734 /*
1735 * This sets up our little cheat allowing us to reuse the async read io
1736 * completion handler for directories.
1737 */
1738 ni->itype.index.block_size = vol->mft_record_size;
1739 ni->itype.index.block_size_bits = vol->mft_record_size_bits;
1740
1741 /* Very important! Needed to be able to call map_mft_record*(). */
1742 vol->mft_ino = vi;
1743
1744 /* Allocate enough memory to read the first mft record. */
1745 if (vol->mft_record_size > 64 * 1024) {
1746 ntfs_error(sb, "Unsupported mft record size %i (max 64kiB).",
1747 vol->mft_record_size);
1748 goto err_out;
1749 }
1750 i = vol->mft_record_size;
1751 if (i < sb->s_blocksize)
1752 i = sb->s_blocksize;
1753 m = (MFT_RECORD*)ntfs_malloc_nofs(i);
1754 if (!m) {
1755 ntfs_error(sb, "Failed to allocate buffer for $MFT record 0.");
1756 goto err_out;
1757 }
1758
1759 /* Determine the first block of the $MFT/$DATA attribute. */
1760 block = vol->mft_lcn << vol->cluster_size_bits >>
1761 sb->s_blocksize_bits;
1762 nr_blocks = vol->mft_record_size >> sb->s_blocksize_bits;
1763 if (!nr_blocks)
1764 nr_blocks = 1;
1765
1766 /* Load $MFT/$DATA's first mft record. */
1767 for (i = 0; i < nr_blocks; i++) {
1768 bh = sb_bread(sb, block++);
1769 if (!bh) {
1770 ntfs_error(sb, "Device read failed.");
1771 goto err_out;
1772 }
1773 memcpy((char*)m + (i << sb->s_blocksize_bits), bh->b_data,
1774 sb->s_blocksize);
1775 brelse(bh);
1776 }
1777
1778 /* Apply the mst fixups. */
1779 if (post_read_mst_fixup((NTFS_RECORD*)m, vol->mft_record_size)) {
1780 /* FIXME: Try to use the $MFTMirr now. */
1781 ntfs_error(sb, "MST fixup failed. $MFT is corrupt.");
1782 goto err_out;
1783 }
1784
1785 /* Need this to sanity check attribute list references to $MFT. */
1786 vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number);
1787
1788 /* Provides readpage() and sync_page() for map_mft_record(). */
1789 vi->i_mapping->a_ops = &ntfs_mst_aops;
1790
1791 ctx = ntfs_attr_get_search_ctx(ni, m);
1792 if (!ctx) {
1793 err = -ENOMEM;
1794 goto err_out;
1795 }
1796
1797 /* Find the attribute list attribute if present. */
1798 err = ntfs_attr_lookup(AT_ATTRIBUTE_LIST, NULL, 0, 0, 0, NULL, 0, ctx);
1799 if (err) {
1800 if (unlikely(err != -ENOENT)) {
1801 ntfs_error(sb, "Failed to lookup attribute list "
1802 "attribute. You should run chkdsk.");
1803 goto put_err_out;
1804 }
1805 } else /* if (!err) */ {
1806 ATTR_LIST_ENTRY *al_entry, *next_al_entry;
1807 u8 *al_end;
1808
1809 ntfs_debug("Attribute list attribute found in $MFT.");
1810 NInoSetAttrList(ni);
1811 if (ctx->attr->flags & ATTR_IS_ENCRYPTED ||
1812 ctx->attr->flags & ATTR_COMPRESSION_MASK ||
1813 ctx->attr->flags & ATTR_IS_SPARSE) {
1814 ntfs_error(sb, "Attribute list attribute is "
1815 "compressed/encrypted/sparse. Not "
1816 "allowed. $MFT is corrupt. You should "
1817 "run chkdsk.");
1818 goto put_err_out;
1819 }
1820 /* Now allocate memory for the attribute list. */
1821 ni->attr_list_size = (u32)ntfs_attr_size(ctx->attr);
1822 ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size);
1823 if (!ni->attr_list) {
1824 ntfs_error(sb, "Not enough memory to allocate buffer "
1825 "for attribute list.");
1826 goto put_err_out;
1827 }
1828 if (ctx->attr->non_resident) {
1829 NInoSetAttrListNonResident(ni);
1830 if (ctx->attr->data.non_resident.lowest_vcn) {
1831 ntfs_error(sb, "Attribute list has non zero "
1832 "lowest_vcn. $MFT is corrupt. "
1833 "You should run chkdsk.");
1834 goto put_err_out;
1835 }
1836 /* Setup the runlist. */
1837 ni->attr_list_rl.rl = ntfs_mapping_pairs_decompress(vol,
1838 ctx->attr, NULL);
1839 if (IS_ERR(ni->attr_list_rl.rl)) {
1840 err = PTR_ERR(ni->attr_list_rl.rl);
1841 ni->attr_list_rl.rl = NULL;
1842 ntfs_error(sb, "Mapping pairs decompression "
1843 "failed with error code %i.",
1844 -err);
1845 goto put_err_out;
1846 }
1847 /* Now load the attribute list. */
1848 if ((err = load_attribute_list(vol, &ni->attr_list_rl,
1849 ni->attr_list, ni->attr_list_size,
1850 sle64_to_cpu(ctx->attr->data.
1851 non_resident.initialized_size)))) {
1852 ntfs_error(sb, "Failed to load attribute list "
1853 "attribute with error code %i.",
1854 -err);
1855 goto put_err_out;
1856 }
1857 } else /* if (!ctx.attr->non_resident) */ {
1858 if ((u8*)ctx->attr + le16_to_cpu(
1859 ctx->attr->data.resident.value_offset) +
1860 le32_to_cpu(
1861 ctx->attr->data.resident.value_length) >
1862 (u8*)ctx->mrec + vol->mft_record_size) {
1863 ntfs_error(sb, "Corrupt attribute list "
1864 "attribute.");
1865 goto put_err_out;
1866 }
1867 /* Now copy the attribute list. */
1868 memcpy(ni->attr_list, (u8*)ctx->attr + le16_to_cpu(
1869 ctx->attr->data.resident.value_offset),
1870 le32_to_cpu(
1871 ctx->attr->data.resident.value_length));
1872 }
1873 /* The attribute list is now setup in memory. */
1874 /*
1875 * FIXME: I don't know if this case is actually possible.
1876 * According to logic it is not possible but I have seen too
1877 * many weird things in MS software to rely on logic... Thus we
1878 * perform a manual search and make sure the first $MFT/$DATA
1879 * extent is in the base inode. If it is not we abort with an
1880 * error and if we ever see a report of this error we will need
1881 * to do some magic in order to have the necessary mft record
1882 * loaded and in the right place in the page cache. But
1883 * hopefully logic will prevail and this never happens...
1884 */
1885 al_entry = (ATTR_LIST_ENTRY*)ni->attr_list;
1886 al_end = (u8*)al_entry + ni->attr_list_size;
1887 for (;; al_entry = next_al_entry) {
1888 /* Out of bounds check. */
1889 if ((u8*)al_entry < ni->attr_list ||
1890 (u8*)al_entry > al_end)
1891 goto em_put_err_out;
1892 /* Catch the end of the attribute list. */
1893 if ((u8*)al_entry == al_end)
1894 goto em_put_err_out;
1895 if (!al_entry->length)
1896 goto em_put_err_out;
1897 if ((u8*)al_entry + 6 > al_end || (u8*)al_entry +
1898 le16_to_cpu(al_entry->length) > al_end)
1899 goto em_put_err_out;
1900 next_al_entry = (ATTR_LIST_ENTRY*)((u8*)al_entry +
1901 le16_to_cpu(al_entry->length));
1902 if (le32_to_cpu(al_entry->type) >
1903 const_le32_to_cpu(AT_DATA))
1904 goto em_put_err_out;
1905 if (AT_DATA != al_entry->type)
1906 continue;
1907 /* We want an unnamed attribute. */
1908 if (al_entry->name_length)
1909 goto em_put_err_out;
1910 /* Want the first entry, i.e. lowest_vcn == 0. */
1911 if (al_entry->lowest_vcn)
1912 goto em_put_err_out;
1913 /* First entry has to be in the base mft record. */
1914 if (MREF_LE(al_entry->mft_reference) != vi->i_ino) {
1915 /* MFT references do not match, logic fails. */
1916 ntfs_error(sb, "BUG: The first $DATA extent "
1917 "of $MFT is not in the base "
1918 "mft record. Please report "
1919 "you saw this message to "
1920 "linux-ntfs-dev@lists."
1921 "sourceforge.net");
1922 goto put_err_out;
1923 } else {
1924 /* Sequence numbers must match. */
1925 if (MSEQNO_LE(al_entry->mft_reference) !=
1926 ni->seq_no)
1927 goto em_put_err_out;
1928 /* Got it. All is ok. We can stop now. */
1929 break;
1930 }
1931 }
1932 }
1933
1934 ntfs_attr_reinit_search_ctx(ctx);
1935
1936 /* Now load all attribute extents. */
1937 attr = NULL;
1938 next_vcn = last_vcn = highest_vcn = 0;
1939 while (!(err = ntfs_attr_lookup(AT_DATA, NULL, 0, 0, next_vcn, NULL, 0,
1940 ctx))) {
1941 runlist_element *nrl;
1942
1943 /* Cache the current attribute. */
1944 attr = ctx->attr;
1945 /* $MFT must be non-resident. */
1946 if (!attr->non_resident) {
1947 ntfs_error(sb, "$MFT must be non-resident but a "
1948 "resident extent was found. $MFT is "
1949 "corrupt. Run chkdsk.");
1950 goto put_err_out;
1951 }
1952 /* $MFT must be uncompressed and unencrypted. */
1953 if (attr->flags & ATTR_COMPRESSION_MASK ||
1954 attr->flags & ATTR_IS_ENCRYPTED ||
1955 attr->flags & ATTR_IS_SPARSE) {
1956 ntfs_error(sb, "$MFT must be uncompressed, "
1957 "non-sparse, and unencrypted but a "
1958 "compressed/sparse/encrypted extent "
1959 "was found. $MFT is corrupt. Run "
1960 "chkdsk.");
1961 goto put_err_out;
1962 }
1963 /*
1964 * Decompress the mapping pairs array of this extent and merge
1965 * the result into the existing runlist. No need for locking
1966 * as we have exclusive access to the inode at this time and we
1967 * are a mount in progress task, too.
1968 */
1969 nrl = ntfs_mapping_pairs_decompress(vol, attr, ni->runlist.rl);
1970 if (IS_ERR(nrl)) {
1971 ntfs_error(sb, "ntfs_mapping_pairs_decompress() "
1972 "failed with error code %ld. $MFT is "
1973 "corrupt.", PTR_ERR(nrl));
1974 goto put_err_out;
1975 }
1976 ni->runlist.rl = nrl;
1977
1978 /* Are we in the first extent? */
1979 if (!next_vcn) {
1980 if (attr->data.non_resident.lowest_vcn) {
1981 ntfs_error(sb, "First extent of $DATA "
1982 "attribute has non zero "
1983 "lowest_vcn. $MFT is corrupt. "
1984 "You should run chkdsk.");
1985 goto put_err_out;
1986 }
1987 /* Get the last vcn in the $DATA attribute. */
1988 last_vcn = sle64_to_cpu(
1989 attr->data.non_resident.allocated_size)
1990 >> vol->cluster_size_bits;
1991 /* Fill in the inode size. */
1992 vi->i_size = sle64_to_cpu(
1993 attr->data.non_resident.data_size);
1994 ni->initialized_size = sle64_to_cpu(attr->data.
1995 non_resident.initialized_size);
1996 ni->allocated_size = sle64_to_cpu(
1997 attr->data.non_resident.allocated_size);
1998 /*
1999 * Verify the number of mft records does not exceed
2000 * 2^32 - 1.
2001 */
2002 if ((vi->i_size >> vol->mft_record_size_bits) >=
2003 (1ULL << 32)) {
2004 ntfs_error(sb, "$MFT is too big! Aborting.");
2005 goto put_err_out;
2006 }
2007 /*
2008 * We have got the first extent of the runlist for
2009 * $MFT which means it is now relatively safe to call
2010 * the normal ntfs_read_inode() function.
2011 * Complete reading the inode, this will actually
2012 * re-read the mft record for $MFT, this time entering
2013 * it into the page cache with which we complete the
2014 * kick start of the volume. It should be safe to do
2015 * this now as the first extent of $MFT/$DATA is
2016 * already known and we would hope that we don't need
2017 * further extents in order to find the other
2018 * attributes belonging to $MFT. Only time will tell if
2019 * this is really the case. If not we will have to play
2020 * magic at this point, possibly duplicating a lot of
2021 * ntfs_read_inode() at this point. We will need to
2022 * ensure we do enough of its work to be able to call
2023 * ntfs_read_inode() on extents of $MFT/$DATA. But lets
2024 * hope this never happens...
2025 */
2026 ntfs_read_locked_inode(vi);
2027 if (is_bad_inode(vi)) {
2028 ntfs_error(sb, "ntfs_read_inode() of $MFT "
2029 "failed. BUG or corrupt $MFT. "
2030 "Run chkdsk and if no errors "
2031 "are found, please report you "
2032 "saw this message to "
2033 "linux-ntfs-dev@lists."
2034 "sourceforge.net");
2035 ntfs_attr_put_search_ctx(ctx);
2036 /* Revert to the safe super operations. */
2037 ntfs_free(m);
2038 return -1;
2039 }
2040 /*
2041 * Re-initialize some specifics about $MFT's inode as
2042 * ntfs_read_inode() will have set up the default ones.
2043 */
2044 /* Set uid and gid to root. */
2045 vi->i_uid = vi->i_gid = 0;
2046 /* Regular file. No access for anyone. */
2047 vi->i_mode = S_IFREG;
2048 /* No VFS initiated operations allowed for $MFT. */
2049 vi->i_op = &ntfs_empty_inode_ops;
2050 vi->i_fop = &ntfs_empty_file_ops;
2051 }
2052
2053 /* Get the lowest vcn for the next extent. */
2054 highest_vcn = sle64_to_cpu(attr->data.non_resident.highest_vcn);
2055 next_vcn = highest_vcn + 1;
2056
2057 /* Only one extent or error, which we catch below. */
2058 if (next_vcn <= 0)
2059 break;
2060
2061 /* Avoid endless loops due to corruption. */
2062 if (next_vcn < sle64_to_cpu(
2063 attr->data.non_resident.lowest_vcn)) {
2064 ntfs_error(sb, "$MFT has corrupt attribute list "
2065 "attribute. Run chkdsk.");
2066 goto put_err_out;
2067 }
2068 }
2069 if (err != -ENOENT) {
2070 ntfs_error(sb, "Failed to lookup $MFT/$DATA attribute extent. "
2071 "$MFT is corrupt. Run chkdsk.");
2072 goto put_err_out;
2073 }
2074 if (!attr) {
2075 ntfs_error(sb, "$MFT/$DATA attribute not found. $MFT is "
2076 "corrupt. Run chkdsk.");
2077 goto put_err_out;
2078 }
2079 if (highest_vcn && highest_vcn != last_vcn - 1) {
2080 ntfs_error(sb, "Failed to load the complete runlist for "
2081 "$MFT/$DATA. Driver bug or corrupt $MFT. "
2082 "Run chkdsk.");
2083 ntfs_debug("highest_vcn = 0x%llx, last_vcn - 1 = 0x%llx",
2084 (unsigned long long)highest_vcn,
2085 (unsigned long long)last_vcn - 1);
2086 goto put_err_out;
2087 }
2088 ntfs_attr_put_search_ctx(ctx);
2089 ntfs_debug("Done.");
2090 ntfs_free(m);
2091 return 0;
2092
2093em_put_err_out:
2094 ntfs_error(sb, "Couldn't find first extent of $DATA attribute in "
2095 "attribute list. $MFT is corrupt. Run chkdsk.");
2096put_err_out:
2097 ntfs_attr_put_search_ctx(ctx);
2098err_out:
2099 ntfs_error(sb, "Failed. Marking inode as bad.");
2100 make_bad_inode(vi);
2101 ntfs_free(m);
2102 return -1;
2103}
2104
2105/**
2106 * ntfs_put_inode - handler for when the inode reference count is decremented
2107 * @vi: vfs inode
2108 *
2109 * The VFS calls ntfs_put_inode() every time the inode reference count (i_count)
2110 * is about to be decremented (but before the decrement itself.
2111 *
2112 * If the inode @vi is a directory with two references, one of which is being
2113 * dropped, we need to put the attribute inode for the directory index bitmap,
2114 * if it is present, otherwise the directory inode would remain pinned for
2115 * ever.
2116 */
2117void ntfs_put_inode(struct inode *vi)
2118{
2119 if (S_ISDIR(vi->i_mode) && atomic_read(&vi->i_count) == 2) {
2120 ntfs_inode *ni = NTFS_I(vi);
2121 if (NInoIndexAllocPresent(ni)) {
2122 struct inode *bvi = NULL;
2123 down(&vi->i_sem);
2124 if (atomic_read(&vi->i_count) == 2) {
2125 bvi = ni->itype.index.bmp_ino;
2126 if (bvi)
2127 ni->itype.index.bmp_ino = NULL;
2128 }
2129 up(&vi->i_sem);
2130 if (bvi)
2131 iput(bvi);
2132 }
2133 }
2134}
2135
2136static void __ntfs_clear_inode(ntfs_inode *ni)
2137{
2138 /* Free all alocated memory. */
2139 down_write(&ni->runlist.lock);
2140 if (ni->runlist.rl) {
2141 ntfs_free(ni->runlist.rl);
2142 ni->runlist.rl = NULL;
2143 }
2144 up_write(&ni->runlist.lock);
2145
2146 if (ni->attr_list) {
2147 ntfs_free(ni->attr_list);
2148 ni->attr_list = NULL;
2149 }
2150
2151 down_write(&ni->attr_list_rl.lock);
2152 if (ni->attr_list_rl.rl) {
2153 ntfs_free(ni->attr_list_rl.rl);
2154 ni->attr_list_rl.rl = NULL;
2155 }
2156 up_write(&ni->attr_list_rl.lock);
2157
2158 if (ni->name_len && ni->name != I30) {
2159 /* Catch bugs... */
2160 BUG_ON(!ni->name);
2161 kfree(ni->name);
2162 }
2163}
2164
2165void ntfs_clear_extent_inode(ntfs_inode *ni)
2166{
2167 ntfs_debug("Entering for inode 0x%lx.", ni->mft_no);
2168
2169 BUG_ON(NInoAttr(ni));
2170 BUG_ON(ni->nr_extents != -1);
2171
2172#ifdef NTFS_RW
2173 if (NInoDirty(ni)) {
2174 if (!is_bad_inode(VFS_I(ni->ext.base_ntfs_ino)))
2175 ntfs_error(ni->vol->sb, "Clearing dirty extent inode! "
2176 "Losing data! This is a BUG!!!");
2177 // FIXME: Do something!!!
2178 }
2179#endif /* NTFS_RW */
2180
2181 __ntfs_clear_inode(ni);
2182
2183 /* Bye, bye... */
2184 ntfs_destroy_extent_inode(ni);
2185}
2186
2187/**
2188 * ntfs_clear_big_inode - clean up the ntfs specific part of an inode
2189 * @vi: vfs inode pending annihilation
2190 *
2191 * When the VFS is going to remove an inode from memory, ntfs_clear_big_inode()
2192 * is called, which deallocates all memory belonging to the NTFS specific part
2193 * of the inode and returns.
2194 *
2195 * If the MFT record is dirty, we commit it before doing anything else.
2196 */
2197void ntfs_clear_big_inode(struct inode *vi)
2198{
2199 ntfs_inode *ni = NTFS_I(vi);
2200
2201 /*
2202 * If the inode @vi is an index inode we need to put the attribute
2203 * inode for the index bitmap, if it is present, otherwise the index
2204 * inode would disappear and the attribute inode for the index bitmap
2205 * would no longer be referenced from anywhere and thus it would remain
2206 * pinned for ever.
2207 */
2208 if (NInoAttr(ni) && (ni->type == AT_INDEX_ALLOCATION) &&
2209 NInoIndexAllocPresent(ni) && ni->itype.index.bmp_ino) {
2210 iput(ni->itype.index.bmp_ino);
2211 ni->itype.index.bmp_ino = NULL;
2212 }
2213#ifdef NTFS_RW
2214 if (NInoDirty(ni)) {
2215 BOOL was_bad = (is_bad_inode(vi));
2216
2217 /* Committing the inode also commits all extent inodes. */
2218 ntfs_commit_inode(vi);
2219
2220 if (!was_bad && (is_bad_inode(vi) || NInoDirty(ni))) {
2221 ntfs_error(vi->i_sb, "Failed to commit dirty inode "
2222 "0x%lx. Losing data!", vi->i_ino);
2223 // FIXME: Do something!!!
2224 }
2225 }
2226#endif /* NTFS_RW */
2227
2228 /* No need to lock at this stage as no one else has a reference. */
2229 if (ni->nr_extents > 0) {
2230 int i;
2231
2232 for (i = 0; i < ni->nr_extents; i++)
2233 ntfs_clear_extent_inode(ni->ext.extent_ntfs_inos[i]);
2234 kfree(ni->ext.extent_ntfs_inos);
2235 }
2236
2237 __ntfs_clear_inode(ni);
2238
2239 if (NInoAttr(ni)) {
2240 /* Release the base inode if we are holding it. */
2241 if (ni->nr_extents == -1) {
2242 iput(VFS_I(ni->ext.base_ntfs_ino));
2243 ni->nr_extents = 0;
2244 ni->ext.base_ntfs_ino = NULL;
2245 }
2246 }
2247 return;
2248}
2249
2250/**
2251 * ntfs_show_options - show mount options in /proc/mounts
2252 * @sf: seq_file in which to write our mount options
2253 * @mnt: vfs mount whose mount options to display
2254 *
2255 * Called by the VFS once for each mounted ntfs volume when someone reads
2256 * /proc/mounts in order to display the NTFS specific mount options of each
2257 * mount. The mount options of the vfs mount @mnt are written to the seq file
2258 * @sf and success is returned.
2259 */
2260int ntfs_show_options(struct seq_file *sf, struct vfsmount *mnt)
2261{
2262 ntfs_volume *vol = NTFS_SB(mnt->mnt_sb);
2263 int i;
2264
2265 seq_printf(sf, ",uid=%i", vol->uid);
2266 seq_printf(sf, ",gid=%i", vol->gid);
2267 if (vol->fmask == vol->dmask)
2268 seq_printf(sf, ",umask=0%o", vol->fmask);
2269 else {
2270 seq_printf(sf, ",fmask=0%o", vol->fmask);
2271 seq_printf(sf, ",dmask=0%o", vol->dmask);
2272 }
2273 seq_printf(sf, ",nls=%s", vol->nls_map->charset);
2274 if (NVolCaseSensitive(vol))
2275 seq_printf(sf, ",case_sensitive");
2276 if (NVolShowSystemFiles(vol))
2277 seq_printf(sf, ",show_sys_files");
2278 for (i = 0; on_errors_arr[i].val; i++) {
2279 if (on_errors_arr[i].val & vol->on_errors)
2280 seq_printf(sf, ",errors=%s", on_errors_arr[i].str);
2281 }
2282 seq_printf(sf, ",mft_zone_multiplier=%i", vol->mft_zone_multiplier);
2283 return 0;
2284}
2285
2286#ifdef NTFS_RW
2287
2288/**
2289 * ntfs_truncate - called when the i_size of an ntfs inode is changed
2290 * @vi: inode for which the i_size was changed
2291 *
2292 * We do not support i_size changes yet.
2293 *
2294 * The kernel guarantees that @vi is a regular file (S_ISREG() is true) and
2295 * that the change is allowed.
2296 *
2297 * This implies for us that @vi is a file inode rather than a directory, index,
2298 * or attribute inode as well as that @vi is a base inode.
2299 *
2300 * Returns 0 on success or -errno on error.
2301 *
2302 * Called with ->i_sem held. In all but one case ->i_alloc_sem is held for
2303 * writing. The only case where ->i_alloc_sem is not held is
2304 * mm/filemap.c::generic_file_buffered_write() where vmtruncate() is called
2305 * with the current i_size as the offset which means that it is a noop as far
2306 * as ntfs_truncate() is concerned.
2307 */
2308int ntfs_truncate(struct inode *vi)
2309{
2310 ntfs_inode *ni = NTFS_I(vi);
2311 ntfs_volume *vol = ni->vol;
2312 ntfs_attr_search_ctx *ctx;
2313 MFT_RECORD *m;
2314 const char *te = " Leaving file length out of sync with i_size.";
2315 int err;
2316
2317 ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
2318 BUG_ON(NInoAttr(ni));
2319 BUG_ON(ni->nr_extents < 0);
2320 m = map_mft_record(ni);
2321 if (IS_ERR(m)) {
2322 err = PTR_ERR(m);
2323 ntfs_error(vi->i_sb, "Failed to map mft record for inode 0x%lx "
2324 "(error code %d).%s", vi->i_ino, err, te);
2325 ctx = NULL;
2326 m = NULL;
2327 goto err_out;
2328 }
2329 ctx = ntfs_attr_get_search_ctx(ni, m);
2330 if (unlikely(!ctx)) {
2331 ntfs_error(vi->i_sb, "Failed to allocate a search context for "
2332 "inode 0x%lx (not enough memory).%s",
2333 vi->i_ino, te);
2334 err = -ENOMEM;
2335 goto err_out;
2336 }
2337 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
2338 CASE_SENSITIVE, 0, NULL, 0, ctx);
2339 if (unlikely(err)) {
2340 if (err == -ENOENT)
2341 ntfs_error(vi->i_sb, "Open attribute is missing from "
2342 "mft record. Inode 0x%lx is corrupt. "
2343 "Run chkdsk.", vi->i_ino);
2344 else
2345 ntfs_error(vi->i_sb, "Failed to lookup attribute in "
2346 "inode 0x%lx (error code %d).",
2347 vi->i_ino, err);
2348 goto err_out;
2349 }
2350 /* If the size has not changed there is nothing to do. */
2351 if (ntfs_attr_size(ctx->attr) == i_size_read(vi))
2352 goto done;
2353 // TODO: Implement the truncate...
2354 ntfs_error(vi->i_sb, "Inode size has changed but this is not "
2355 "implemented yet. Resetting inode size to old value. "
2356 " This is most likely a bug in the ntfs driver!");
2357 i_size_write(vi, ntfs_attr_size(ctx->attr));
2358done:
2359 ntfs_attr_put_search_ctx(ctx);
2360 unmap_mft_record(ni);
2361 NInoClearTruncateFailed(ni);
2362 ntfs_debug("Done.");
2363 return 0;
2364err_out:
2365 if (err != -ENOMEM) {
2366 NVolSetErrors(vol);
2367 make_bad_inode(vi);
2368 }
2369 if (ctx)
2370 ntfs_attr_put_search_ctx(ctx);
2371 if (m)
2372 unmap_mft_record(ni);
2373 NInoSetTruncateFailed(ni);
2374 return err;
2375}
2376
/**
 * ntfs_truncate_vfs - wrapper for ntfs_truncate() that has no return value
 * @vi:		inode for which the i_size was changed
 *
 * Calls ntfs_truncate() on @vi and discards its return value.
 *
 * See the description of ntfs_truncate() for details.
 */
void ntfs_truncate_vfs(struct inode *vi)
{
	(void)ntfs_truncate(vi);
}
2388
2389/**
2390 * ntfs_setattr - called from notify_change() when an attribute is being changed
2391 * @dentry: dentry whose attributes to change
2392 * @attr: structure describing the attributes and the changes
2393 *
2394 * We have to trap VFS attempts to truncate the file described by @dentry as
2395 * soon as possible, because we do not implement changes in i_size yet. So we
2396 * abort all i_size changes here.
2397 *
2398 * We also abort all changes of user, group, and mode as we do not implement
2399 * the NTFS ACLs yet.
2400 *
2401 * Called with ->i_sem held. For the ATTR_SIZE (i.e. ->truncate) case, also
2402 * called with ->i_alloc_sem held for writing.
2403 *
2404 * Basically this is a copy of generic notify_change() and inode_setattr()
2405 * functionality, except we intercept and abort changes in i_size.
2406 */
2407int ntfs_setattr(struct dentry *dentry, struct iattr *attr)
2408{
2409 struct inode *vi = dentry->d_inode;
2410 int err;
2411 unsigned int ia_valid = attr->ia_valid;
2412
2413 err = inode_change_ok(vi, attr);
2414 if (err)
2415 return err;
2416
2417 /* We do not support NTFS ACLs yet. */
2418 if (ia_valid & (ATTR_UID | ATTR_GID | ATTR_MODE)) {
2419 ntfs_warning(vi->i_sb, "Changes in user/group/mode are not "
2420 "supported yet, ignoring.");
2421 err = -EOPNOTSUPP;
2422 goto out;
2423 }
2424
2425 if (ia_valid & ATTR_SIZE) {
2426 if (attr->ia_size != i_size_read(vi)) {
2427 ntfs_warning(vi->i_sb, "Changes in inode size are not "
2428 "supported yet, ignoring.");
2429 err = -EOPNOTSUPP;
2430 // TODO: Implement...
2431 // err = vmtruncate(vi, attr->ia_size);
2432 if (err || ia_valid == ATTR_SIZE)
2433 goto out;
2434 } else {
2435 /*
2436 * We skipped the truncate but must still update
2437 * timestamps.
2438 */
2439 ia_valid |= ATTR_MTIME|ATTR_CTIME;
2440 }
2441 }
2442
2443 if (ia_valid & ATTR_ATIME)
2444 vi->i_atime = attr->ia_atime;
2445 if (ia_valid & ATTR_MTIME)
2446 vi->i_mtime = attr->ia_mtime;
2447 if (ia_valid & ATTR_CTIME)
2448 vi->i_ctime = attr->ia_ctime;
2449 mark_inode_dirty(vi);
2450out:
2451 return err;
2452}
2453
2454/**
2455 * ntfs_write_inode - write out a dirty inode
2456 * @vi: inode to write out
2457 * @sync: if true, write out synchronously
2458 *
2459 * Write out a dirty inode to disk including any extent inodes if present.
2460 *
2461 * If @sync is true, commit the inode to disk and wait for io completion. This
2462 * is done using write_mft_record().
2463 *
2464 * If @sync is false, just schedule the write to happen but do not wait for i/o
2465 * completion. In 2.6 kernels, scheduling usually happens just by virtue of
2466 * marking the page (and in this case mft record) dirty but we do not implement
2467 * this yet as write_mft_record() largely ignores the @sync parameter and
2468 * always performs synchronous writes.
2469 *
2470 * Return 0 on success and -errno on error.
2471 */
2472int ntfs_write_inode(struct inode *vi, int sync)
2473{
2474 sle64 nt;
2475 ntfs_inode *ni = NTFS_I(vi);
2476 ntfs_attr_search_ctx *ctx;
2477 MFT_RECORD *m;
2478 STANDARD_INFORMATION *si;
2479 int err = 0;
2480 BOOL modified = FALSE;
2481
2482 ntfs_debug("Entering for %sinode 0x%lx.", NInoAttr(ni) ? "attr " : "",
2483 vi->i_ino);
2484 /*
2485 * Dirty attribute inodes are written via their real inodes so just
2486 * clean them here. Access time updates are taken care off when the
2487 * real inode is written.
2488 */
2489 if (NInoAttr(ni)) {
2490 NInoClearDirty(ni);
2491 ntfs_debug("Done.");
2492 return 0;
2493 }
2494 /* Map, pin, and lock the mft record belonging to the inode. */
2495 m = map_mft_record(ni);
2496 if (IS_ERR(m)) {
2497 err = PTR_ERR(m);
2498 goto err_out;
2499 }
2500 /* Update the access times in the standard information attribute. */
2501 ctx = ntfs_attr_get_search_ctx(ni, m);
2502 if (unlikely(!ctx)) {
2503 err = -ENOMEM;
2504 goto unm_err_out;
2505 }
2506 err = ntfs_attr_lookup(AT_STANDARD_INFORMATION, NULL, 0,
2507 CASE_SENSITIVE, 0, NULL, 0, ctx);
2508 if (unlikely(err)) {
2509 ntfs_attr_put_search_ctx(ctx);
2510 goto unm_err_out;
2511 }
2512 si = (STANDARD_INFORMATION*)((u8*)ctx->attr +
2513 le16_to_cpu(ctx->attr->data.resident.value_offset));
2514 /* Update the access times if they have changed. */
2515 nt = utc2ntfs(vi->i_mtime);
2516 if (si->last_data_change_time != nt) {
2517 ntfs_debug("Updating mtime for inode 0x%lx: old = 0x%llx, "
2518 "new = 0x%llx", vi->i_ino,
2519 sle64_to_cpu(si->last_data_change_time),
2520 sle64_to_cpu(nt));
2521 si->last_data_change_time = nt;
2522 modified = TRUE;
2523 }
2524 nt = utc2ntfs(vi->i_ctime);
2525 if (si->last_mft_change_time != nt) {
2526 ntfs_debug("Updating ctime for inode 0x%lx: old = 0x%llx, "
2527 "new = 0x%llx", vi->i_ino,
2528 sle64_to_cpu(si->last_mft_change_time),
2529 sle64_to_cpu(nt));
2530 si->last_mft_change_time = nt;
2531 modified = TRUE;
2532 }
2533 nt = utc2ntfs(vi->i_atime);
2534 if (si->last_access_time != nt) {
2535 ntfs_debug("Updating atime for inode 0x%lx: old = 0x%llx, "
2536 "new = 0x%llx", vi->i_ino,
2537 sle64_to_cpu(si->last_access_time),
2538 sle64_to_cpu(nt));
2539 si->last_access_time = nt;
2540 modified = TRUE;
2541 }
2542 /*
2543 * If we just modified the standard information attribute we need to
2544 * mark the mft record it is in dirty. We do this manually so that
2545 * mark_inode_dirty() is not called which would redirty the inode and
2546 * hence result in an infinite loop of trying to write the inode.
2547 * There is no need to mark the base inode nor the base mft record
2548 * dirty, since we are going to write this mft record below in any case
2549 * and the base mft record may actually not have been modified so it
2550 * might not need to be written out.
2551 * NOTE: It is not a problem when the inode for $MFT itself is being
2552 * written out as mark_ntfs_record_dirty() will only set I_DIRTY_PAGES
2553 * on the $MFT inode and hence ntfs_write_inode() will not be
2554 * re-invoked because of it which in turn is ok since the dirtied mft
2555 * record will be cleaned and written out to disk below, i.e. before
2556 * this function returns.
2557 */
2558 if (modified && !NInoTestSetDirty(ctx->ntfs_ino))
2559 mark_ntfs_record_dirty(ctx->ntfs_ino->page,
2560 ctx->ntfs_ino->page_ofs);
2561 ntfs_attr_put_search_ctx(ctx);
2562 /* Now the access times are updated, write the base mft record. */
2563 if (NInoDirty(ni))
2564 err = write_mft_record(ni, m, sync);
2565 /* Write all attached extent mft records. */
2566 down(&ni->extent_lock);
2567 if (ni->nr_extents > 0) {
2568 ntfs_inode **extent_nis = ni->ext.extent_ntfs_inos;
2569 int i;
2570
2571 ntfs_debug("Writing %i extent inodes.", ni->nr_extents);
2572 for (i = 0; i < ni->nr_extents; i++) {
2573 ntfs_inode *tni = extent_nis[i];
2574
2575 if (NInoDirty(tni)) {
2576 MFT_RECORD *tm = map_mft_record(tni);
2577 int ret;
2578
2579 if (IS_ERR(tm)) {
2580 if (!err || err == -ENOMEM)
2581 err = PTR_ERR(tm);
2582 continue;
2583 }
2584 ret = write_mft_record(tni, tm, sync);
2585 unmap_mft_record(tni);
2586 if (unlikely(ret)) {
2587 if (!err || err == -ENOMEM)
2588 err = ret;
2589 }
2590 }
2591 }
2592 }
2593 up(&ni->extent_lock);
2594 unmap_mft_record(ni);
2595 if (unlikely(err))
2596 goto err_out;
2597 ntfs_debug("Done.");
2598 return 0;
2599unm_err_out:
2600 unmap_mft_record(ni);
2601err_out:
2602 if (err == -ENOMEM) {
2603 ntfs_warning(vi->i_sb, "Not enough memory to write inode. "
2604 "Marking the inode dirty again, so the VFS "
2605 "retries later.");
2606 mark_inode_dirty(vi);
2607 } else {
2608 ntfs_error(vi->i_sb, "Failed (error code %i): Marking inode "
2609 "as bad. You should run chkdsk.", -err);
2610 make_bad_inode(vi);
2611 NVolSetErrors(ni->vol);
2612 }
2613 return err;
2614}
2615
2616#endif /* NTFS_RW */
diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h
new file mode 100644
index 000000000000..99580455f2ed
--- /dev/null
+++ b/fs/ntfs/inode.h
@@ -0,0 +1,321 @@
1/*
2 * inode.h - Defines for inode structures NTFS Linux kernel driver. Part of
3 * the Linux-NTFS project.
4 *
5 * Copyright (c) 2001-2004 Anton Altaparmakov
6 * Copyright (c) 2002 Richard Russon
7 *
8 * This program/include file is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as published
10 * by the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program/include file is distributed in the hope that it will be
14 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
15 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program (in the main directory of the Linux-NTFS
20 * distribution in the file COPYING); if not, write to the Free Software
21 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
23
24#ifndef _LINUX_NTFS_INODE_H
25#define _LINUX_NTFS_INODE_H
26
27#include <linux/mm.h>
28#include <linux/fs.h>
29#include <linux/seq_file.h>
30#include <linux/list.h>
31#include <asm/atomic.h>
32#include <asm/semaphore.h>
33
34#include "layout.h"
35#include "volume.h"
36#include "types.h"
37#include "runlist.h"
38#include "debug.h"
39
40typedef struct _ntfs_inode ntfs_inode;
41
42/*
43 * The NTFS in-memory inode structure. It is just used as an extension to the
44 * fields already provided in the VFS inode.
45 */
46struct _ntfs_inode {
47 s64 initialized_size; /* Copy from the attribute record. */
48 s64 allocated_size; /* Copy from the attribute record. */
49 unsigned long state; /* NTFS specific flags describing this inode.
50 See ntfs_inode_state_bits below. */
51 unsigned long mft_no; /* Number of the mft record / inode. */
52 u16 seq_no; /* Sequence number of the mft record. */
53 atomic_t count; /* Inode reference count for book keeping. */
54 ntfs_volume *vol; /* Pointer to the ntfs volume of this inode. */
55 /*
56 * If NInoAttr() is true, the below fields describe the attribute which
57 * this fake inode belongs to. The actual inode of this attribute is
58 * pointed to by base_ntfs_ino and nr_extents is always set to -1 (see
59 * below). For real inodes, we also set the type (AT_DATA for files and
60 * AT_INDEX_ALLOCATION for directories), with the name = NULL and
61 * name_len = 0 for files and name = I30 (global constant) and
62 * name_len = 4 for directories.
63 */
64 ATTR_TYPE type; /* Attribute type of this fake inode. */
65 ntfschar *name; /* Attribute name of this fake inode. */
66 u32 name_len; /* Attribute name length of this fake inode. */
67 runlist runlist; /* If state has the NI_NonResident bit set,
68 the runlist of the unnamed data attribute
69 (if a file) or of the index allocation
70 attribute (directory) or of the attribute
71 described by the fake inode (if NInoAttr()).
72 If runlist.rl is NULL, the runlist has not
73 been read in yet or has been unmapped. If
74 NI_NonResident is clear, the attribute is
75 resident (file and fake inode) or there is
76 no $I30 index allocation attribute
77 (small directory). In the latter case
78 runlist.rl is always NULL.*/
79 /*
80 * The following fields are only valid for real inodes and extent
81 * inodes.
82 */
83 struct semaphore mrec_lock; /* Lock for serializing access to the
84 mft record belonging to this inode. */
85 struct page *page; /* The page containing the mft record of the
86 inode. This should only be touched by the
87 (un)map_mft_record*() functions. */
88 int page_ofs; /* Offset into the page at which the mft record
89 begins. This should only be touched by the
90 (un)map_mft_record*() functions. */
91 /*
92 * Attribute list support (only for use by the attribute lookup
93 * functions). Setup during read_inode for all inodes with attribute
94 * lists. Only valid if NI_AttrList is set in state, and attr_list_rl is
95 * further only valid if NI_AttrListNonResident is set.
96 */
97 u32 attr_list_size; /* Length of attribute list value in bytes. */
98 u8 *attr_list; /* Attribute list value itself. */
99 runlist attr_list_rl; /* Run list for the attribute list value. */
100 union {
101 struct { /* It is a directory, $MFT, or an index inode. */
102 struct inode *bmp_ino; /* Attribute inode for the
103 index $BITMAP. */
104 u32 block_size; /* Size of an index block. */
105 u32 vcn_size; /* Size of a vcn in this
106 index. */
107 COLLATION_RULE collation_rule; /* The collation rule
108 for the index. */
109 u8 block_size_bits; /* Log2 of the above. */
110 u8 vcn_size_bits; /* Log2 of the above. */
111 } index;
112 struct { /* It is a compressed file or an attribute inode. */
113 s64 size; /* Copy of compressed_size from
114 $DATA. */
115 u32 block_size; /* Size of a compression block
116 (cb). */
117 u8 block_size_bits; /* Log2 of the size of a cb. */
118 u8 block_clusters; /* Number of clusters per cb. */
119 } compressed;
120 } itype;
121 struct semaphore extent_lock; /* Lock for accessing/modifying the
122 below . */
123 s32 nr_extents; /* For a base mft record, the number of attached extent
124 inodes (0 if none), for extent records and for fake
125 inodes describing an attribute this is -1. */
126 union { /* This union is only used if nr_extents != 0. */
127 ntfs_inode **extent_ntfs_inos; /* For nr_extents > 0, array of
128 the ntfs inodes of the extent
129 mft records belonging to
130 this base inode which have
131 been loaded. */
132 ntfs_inode *base_ntfs_ino; /* For nr_extents == -1, the
133 ntfs inode of the base mft
134 record. For fake inodes, the
135 real (base) inode to which
136 the attribute belongs. */
137 } ext;
138};
139
/*
 * Defined bits for the state field in the ntfs_inode structure.
 * (f) = files only, (d) = directories only, (a) = attributes/fake inodes only
 *
 * The enumerators are bit numbers for use with the bitops emitted by
 * NINO_FNS() and TAS_NINO_FNS() below, so their order must not change.
 */
typedef enum {
	/* 1: Mft record needs to be written to disk. */
	NI_Dirty,
	/* 1: Mft record contains an attribute list. */
	NI_AttrList,
	/* 1: Attribute list is non-resident.  Implies NI_AttrList is set. */
	NI_AttrListNonResident,

	/*
	 * 1: Fake inode for attribute i/o.
	 * 0: Real inode or extent inode.
	 */
	NI_Attr,

	/*
	 * 1: Attribute is protected by MST fixups.
	 * 0: Attribute is not protected by fixups.
	 */
	NI_MstProtected,
	/*
	 * 1: Unnamed data attr is non-resident (f).
	 * 1: Attribute is non-resident (a).
	 */
	NI_NonResident,
	/*
	 * 1: $I30 index alloc attr is present (d).
	 * Shares its bit with NI_NonResident.
	 */
	NI_IndexAllocPresent = NI_NonResident,
	/*
	 * 1: Unnamed data attr is compressed (f).
	 * 1: Create compressed files by default (d).
	 * 1: Attribute is compressed (a).
	 */
	NI_Compressed,
	/*
	 * 1: Unnamed data attr is encrypted (f).
	 * 1: Create encrypted files by default (d).
	 * 1: Attribute is encrypted (a).
	 */
	NI_Encrypted,
	/*
	 * 1: Unnamed data attr is sparse (f).
	 * 1: Create sparse files by default (d).
	 * 1: Attribute is sparse (a).
	 */
	NI_Sparse,
	/* 1: Last ntfs_truncate() call failed. */
	NI_TruncateFailed,
} ntfs_inode_state_bits;
170
171/*
172 * NOTE: We should be adding dirty mft records to a list somewhere and they
173 * should be independent of the (ntfs/vfs) inode structure so that an inode can
174 * be removed but the record can be left dirty for syncing later.
175 */
176
/*
 * Macro tricks to expand the NInoFoo(), NInoSetFoo(), and NInoClearFoo()
 * functions.  Each operates on bit NI_Foo of the state field of the given
 * ntfs inode using test_bit(), set_bit(), and clear_bit(), respectively.
 */
#define NINO_FNS(flag)					\
static inline int NIno##flag(ntfs_inode *ni)		\
{							\
	return test_bit(NI_##flag, &ni->state);		\
}							\
static inline void NInoSet##flag(ntfs_inode *ni)	\
{							\
	set_bit(NI_##flag, &ni->state);			\
}							\
static inline void NInoClear##flag(ntfs_inode *ni)	\
{							\
	clear_bit(NI_##flag, &ni->state);		\
}
194
/*
 * TAS_NINO_FNS(Foo) expands to the combined test-and-modify helpers for the
 * NI_Foo bit of the state field of an ntfs_inode:
 *	NInoTestSetFoo()	- return test_and_set_bit() on the bit,
 *	NInoTestClearFoo()	- return test_and_clear_bit() on the bit.
 */
#define TAS_NINO_FNS(flag)					\
static inline int NInoTestClear##flag(ntfs_inode *ni)		\
{								\
	return test_and_clear_bit(NI_##flag, &ni->state);	\
}								\
static inline int NInoTestSet##flag(ntfs_inode *ni)		\
{								\
	return test_and_set_bit(NI_##flag, &ni->state);		\
}
207
208/* Emit the ntfs inode bitops functions. */
209NINO_FNS(Dirty)
210TAS_NINO_FNS(Dirty)
211NINO_FNS(AttrList)
212NINO_FNS(AttrListNonResident)
213NINO_FNS(Attr)
214NINO_FNS(MstProtected)
215NINO_FNS(NonResident)
216NINO_FNS(IndexAllocPresent)
217NINO_FNS(Compressed)
218NINO_FNS(Encrypted)
219NINO_FNS(Sparse)
220NINO_FNS(TruncateFailed)
221
222/*
223 * The full structure containing a ntfs_inode and a vfs struct inode. Used for
224 * all real and fake inodes but not for extent inodes which lack the vfs struct
225 * inode.
226 */
227typedef struct {
228 ntfs_inode ntfs_inode;
229 struct inode vfs_inode; /* The vfs inode structure. */
230} big_ntfs_inode;
231
232/**
233 * NTFS_I - return the ntfs inode given a vfs inode
234 * @inode: VFS inode
235 *
236 * NTFS_I() returns the ntfs inode associated with the VFS @inode.
237 */
238static inline ntfs_inode *NTFS_I(struct inode *inode)
239{
240 return (ntfs_inode *)list_entry(inode, big_ntfs_inode, vfs_inode);
241}
242
243static inline struct inode *VFS_I(ntfs_inode *ni)
244{
245 return &((big_ntfs_inode *)ni)->vfs_inode;
246}
247
248/**
249 * ntfs_attr - ntfs in memory attribute structure
250 * @mft_no: mft record number of the base mft record of this attribute
251 * @name: Unicode name of the attribute (NULL if unnamed)
252 * @name_len: length of @name in Unicode characters (0 if unnamed)
253 * @type: attribute type (see layout.h)
254 *
255 * This structure exists only to provide a small structure for the
256 * ntfs_{attr_}iget()/ntfs_test_inode()/ntfs_init_locked_inode() mechanism.
257 *
258 * NOTE: Elements are ordered by size to make the structure as compact as
259 * possible on all architectures.
260 */
261typedef struct {
262 unsigned long mft_no;
263 ntfschar *name;
264 u32 name_len;
265 ATTR_TYPE type;
266} ntfs_attr;
267
268typedef int (*test_t)(struct inode *, void *);
269
270extern int ntfs_test_inode(struct inode *vi, ntfs_attr *na);
271
272extern struct inode *ntfs_iget(struct super_block *sb, unsigned long mft_no);
273extern struct inode *ntfs_attr_iget(struct inode *base_vi, ATTR_TYPE type,
274 ntfschar *name, u32 name_len);
275extern struct inode *ntfs_index_iget(struct inode *base_vi, ntfschar *name,
276 u32 name_len);
277
278extern struct inode *ntfs_alloc_big_inode(struct super_block *sb);
279extern void ntfs_destroy_big_inode(struct inode *inode);
280extern void ntfs_clear_big_inode(struct inode *vi);
281
282extern void __ntfs_init_inode(struct super_block *sb, ntfs_inode *ni);
283
284static inline void ntfs_init_big_inode(struct inode *vi)
285{
286 ntfs_inode *ni = NTFS_I(vi);
287
288 ntfs_debug("Entering.");
289 __ntfs_init_inode(vi->i_sb, ni);
290 ni->mft_no = vi->i_ino;
291}
292
293extern ntfs_inode *ntfs_new_extent_inode(struct super_block *sb,
294 unsigned long mft_no);
295extern void ntfs_clear_extent_inode(ntfs_inode *ni);
296
297extern int ntfs_read_inode_mount(struct inode *vi);
298
299extern void ntfs_put_inode(struct inode *vi);
300
301extern int ntfs_show_options(struct seq_file *sf, struct vfsmount *mnt);
302
303#ifdef NTFS_RW
304
int ntfs_truncate(struct inode *vi);
void ntfs_truncate_vfs(struct inode *vi);

int ntfs_setattr(struct dentry *dentry, struct iattr *attr);

int ntfs_write_inode(struct inode *vi, int sync);
311
/*
 * ntfs_commit_inode - write out an inode with the sync argument set
 *
 * Does nothing for a bad inode.  Otherwise calls ntfs_write_inode() with
 * sync == 1; the return value of that call is not propagated.
 */
static inline void ntfs_commit_inode(struct inode *vi)
{
	if (is_bad_inode(vi))
		return;
	ntfs_write_inode(vi, 1);
}
318
319#endif /* NTFS_RW */
320
321#endif /* _LINUX_NTFS_INODE_H */
diff --git a/fs/ntfs/layout.h b/fs/ntfs/layout.h
new file mode 100644
index 000000000000..47b338999921
--- /dev/null
+++ b/fs/ntfs/layout.h
@@ -0,0 +1,2413 @@
1/*
2 * layout.h - All NTFS associated on-disk structures. Part of the Linux-NTFS
3 * project.
4 *
5 * Copyright (c) 2001-2004 Anton Altaparmakov
6 * Copyright (c) 2002 Richard Russon
7 *
8 * This program/include file is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as published
10 * by the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program/include file is distributed in the hope that it will be
14 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
15 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program (in the main directory of the Linux-NTFS
20 * distribution in the file COPYING); if not, write to the Free Software
21 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
23
24#ifndef _LINUX_NTFS_LAYOUT_H
25#define _LINUX_NTFS_LAYOUT_H
26
27#include <linux/types.h>
28#include <linux/bitops.h>
29#include <linux/list.h>
30#include <asm/byteorder.h>
31
32#include "types.h"
33
/*
 * Constant endianness conversion defines.  Each one is a plain alias for the
 * __constant_*() macro of the same name.
 */
#define const_cpu_to_le16(x)	__constant_cpu_to_le16(x)
#define const_le16_to_cpu(x)	__constant_le16_to_cpu(x)

#define const_cpu_to_le32(x)	__constant_cpu_to_le32(x)
#define const_le32_to_cpu(x)	__constant_le32_to_cpu(x)

#define const_cpu_to_le64(x)	__constant_cpu_to_le64(x)
#define const_le64_to_cpu(x)	__constant_le64_to_cpu(x)
44
/*
 * The NTFS oem_id: the four characters "NTFS" followed by four spaces, i.e.
 * the bytes 4e 54 46 53 20 20 20 20 when stored little endian.
 */
#define magicNTFS	const_cpu_to_le64(0x202020205346544eULL)
47
48/*
49 * Location of bootsector on partition:
50 * The standard NTFS_BOOT_SECTOR is on sector 0 of the partition.
51 * On NT4 and above there is one backup copy of the boot sector to
52 * be found on the last sector of the partition (not normally accessible
53 * from within Windows as the bootsector contained number of sectors
54 * value is one less than the actual value!).
55 * On versions of NT 3.51 and earlier, the backup copy was located at
56 * number of sectors/2 (integer divide), i.e. in the middle of the volume.
57 */
58
59/*
60 * BIOS parameter block (bpb) structure.
61 */
62typedef struct {
63 le16 bytes_per_sector; /* Size of a sector in bytes. */
64 u8 sectors_per_cluster; /* Size of a cluster in sectors. */
65 le16 reserved_sectors; /* zero */
66 u8 fats; /* zero */
67 le16 root_entries; /* zero */
68 le16 sectors; /* zero */
69 u8 media_type; /* 0xf8 = hard disk */
70 le16 sectors_per_fat; /* zero */
71 le16 sectors_per_track; /* irrelevant */
72 le16 heads; /* irrelevant */
73 le32 hidden_sectors; /* zero */
74 le32 large_sectors; /* zero */
75} __attribute__ ((__packed__)) BIOS_PARAMETER_BLOCK;
76
77/*
78 * NTFS boot sector structure.
79 */
80typedef struct {
81 u8 jump[3]; /* Irrelevant (jump to boot up code).*/
82 le64 oem_id; /* Magic "NTFS ". */
83 BIOS_PARAMETER_BLOCK bpb; /* See BIOS_PARAMETER_BLOCK. */
84 u8 unused[4]; /* zero, NTFS diskedit.exe states that
85 this is actually:
86 __u8 physical_drive; // 0x80
87 __u8 current_head; // zero
88 __u8 extended_boot_signature;
89 // 0x80
90 __u8 unused; // zero
91 */
92/*0x28*/sle64 number_of_sectors; /* Number of sectors in volume. Gives
93 maximum volume size of 2^63 sectors.
94 Assuming standard sector size of 512
95 bytes, the maximum byte size is
96 approx. 4.7x10^21 bytes. (-; */
97 sle64 mft_lcn; /* Cluster location of mft data. */
98 sle64 mftmirr_lcn; /* Cluster location of copy of mft. */
99 s8 clusters_per_mft_record; /* Mft record size in clusters. */
100 u8 reserved0[3]; /* zero */
101 s8 clusters_per_index_record; /* Index block size in clusters. */
102 u8 reserved1[3]; /* zero */
103 le64 volume_serial_number; /* Irrelevant (serial number). */
104 le32 checksum; /* Boot sector checksum. */
105/*0x54*/u8 bootstrap[426]; /* Irrelevant (boot up code). */
106 le16 end_of_sector_marker; /* End of bootsector magic. Always is
107 0xaa55 in little endian. */
108/* sizeof() = 512 (0x200) bytes */
109} __attribute__ ((__packed__)) NTFS_BOOT_SECTOR;
110
111/*
112 * Magic identifiers present at the beginning of all ntfs record containing
113 * records (like mft records for example).
114 */
115enum {
116 /* Found in $MFT/$DATA. */
117 magic_FILE = const_cpu_to_le32(0x454c4946), /* Mft entry. */
118 magic_INDX = const_cpu_to_le32(0x58444e49), /* Index buffer. */
119 magic_HOLE = const_cpu_to_le32(0x454c4f48), /* ? (NTFS 3.0+?) */
120
121 /* Found in $LogFile/$DATA. */
122 magic_RSTR = const_cpu_to_le32(0x52545352), /* Restart page. */
123 magic_RCRD = const_cpu_to_le32(0x44524352), /* Log record page. */
124
125 /* Found in $LogFile/$DATA. (May be found in $MFT/$DATA, also?) */
126 magic_CHKD = const_cpu_to_le32(0x424b4843), /* Modified by chkdsk. */
127
128 /* Found in all ntfs record containing records. */
129 magic_BAAD = const_cpu_to_le32(0x44414142), /* Failed multi sector
130 transfer was detected. */
131 /*
132 * Found in $LogFile/$DATA when a page is full of 0xff bytes and is
133 * thus not initialized. Page must be initialized before using it.
134 */
135 magic_empty = const_cpu_to_le32(0xffffffff) /* Record is empty. */
136};
137
138typedef le32 NTFS_RECORD_TYPE;
139
140/*
141 * Generic magic comparison macros. Finally found a use for the ## preprocessor
142 * operator! (-8
143 */
144
145static inline BOOL __ntfs_is_magic(le32 x, NTFS_RECORD_TYPE r)
146{
147 return (x == r);
148}
149#define ntfs_is_magic(x, m) __ntfs_is_magic(x, magic_##m)
150
151static inline BOOL __ntfs_is_magicp(le32 *p, NTFS_RECORD_TYPE r)
152{
153 return (*p == r);
154}
155#define ntfs_is_magicp(p, m) __ntfs_is_magicp(p, magic_##m)
156
/*
 * Specialised magic comparison macros for the NTFS_RECORD_TYPEs defined above.
 * The plain variants take the magic by value, the ...p variants take a
 * pointer to it.
 */

/* FILE records; the mft variants are aliases for the file variants. */
#define ntfs_is_file_record(x)		(ntfs_is_magic(x, FILE))
#define ntfs_is_file_recordp(p)		(ntfs_is_magicp(p, FILE))
#define ntfs_is_mft_record(x)		(ntfs_is_file_record(x))
#define ntfs_is_mft_recordp(p)		(ntfs_is_file_recordp(p))

/* INDX records. */
#define ntfs_is_indx_record(x)		(ntfs_is_magic(x, INDX))
#define ntfs_is_indx_recordp(p)		(ntfs_is_magicp(p, INDX))

/* HOLE records. */
#define ntfs_is_hole_record(x)		(ntfs_is_magic(x, HOLE))
#define ntfs_is_hole_recordp(p)		(ntfs_is_magicp(p, HOLE))

/* RSTR and RCRD records. */
#define ntfs_is_rstr_record(x)		(ntfs_is_magic(x, RSTR))
#define ntfs_is_rstr_recordp(p)		(ntfs_is_magicp(p, RSTR))
#define ntfs_is_rcrd_record(x)		(ntfs_is_magic(x, RCRD))
#define ntfs_is_rcrd_recordp(p)		(ntfs_is_magicp(p, RCRD))

/* CHKD records. */
#define ntfs_is_chkd_record(x)		(ntfs_is_magic(x, CHKD))
#define ntfs_is_chkd_recordp(p)		(ntfs_is_magicp(p, CHKD))

/* BAAD records. */
#define ntfs_is_baad_record(x)		(ntfs_is_magic(x, BAAD))
#define ntfs_is_baad_recordp(p)		(ntfs_is_magicp(p, BAAD))

/* Empty records. */
#define ntfs_is_empty_record(x)		(ntfs_is_magic(x, empty))
#define ntfs_is_empty_recordp(p)	(ntfs_is_magicp(p, empty))
182
183/*
184 * The Update Sequence Array (usa) is an array of the le16 values which belong
185 * to the end of each sector protected by the update sequence record in which
186 * this array is contained. Note that the first entry is the Update Sequence
187 * Number (usn), a cyclic counter of how many times the protected record has
188 * been written to disk. The values 0 and -1 (ie. 0xffff) are not used. All
189 * last le16's of each sector have to be equal to the usn (during reading) or
190 * are set to it (during writing). If they are not, an incomplete multi sector
191 * transfer has occurred when the data was written.
192 * The maximum size for the update sequence array is fixed to:
193 * maximum size = usa_ofs + (usa_count * 2) = 510 bytes
194 * The 510 bytes comes from the fact that the last le16 in the array has to
195 * (obviously) finish before the last le16 of the first 512-byte sector.
196 * This formula can be used as a consistency check in that usa_ofs +
197 * (usa_count * 2) has to be less than or equal to 510.
198 */
199typedef struct {
200 NTFS_RECORD_TYPE magic; /* A four-byte magic identifying the record
201 type and/or status. */
202 le16 usa_ofs; /* Offset to the Update Sequence Array (usa)
203 from the start of the ntfs record. */
204 le16 usa_count; /* Number of le16 sized entries in the usa
205 including the Update Sequence Number (usn),
206 thus the number of fixups is the usa_count
207 minus 1. */
208} __attribute__ ((__packed__)) NTFS_RECORD;
209
210/*
211 * System files mft record numbers. All these files are always marked as used
212 * in the bitmap attribute of the mft; presumably in order to avoid accidental
213 * allocation for random other mft records. Also, the sequence number for each
214 * of the system files is always equal to their mft record number and it is
215 * never modified.
216 */
typedef enum {
	/*
	 * Master file table (mft).  The data attribute contains the entries
	 * and the bitmap attribute records which ones are in use (bit==1).
	 */
	FILE_MFT = 0,
	/*
	 * Mft mirror: copy of the first four mft records in the data
	 * attribute.  If cluster size > 4kiB, copy of the first N mft
	 * records, with N = cluster_size / mft_record_size.
	 */
	FILE_MFTMirr = 1,
	/* Journalling log in the data attribute. */
	FILE_LogFile = 2,
	/*
	 * Volume name attribute and volume information attribute (flags and
	 * ntfs version).  Windows refers to this file as volume DASD (Direct
	 * Access Storage Device).
	 */
	FILE_Volume = 3,
	/* Array of attribute definitions in the data attribute. */
	FILE_AttrDef = 4,
	/* Root directory. */
	FILE_root = 5,
	/* Allocation bitmap of all clusters (lcns) in the data attribute. */
	FILE_Bitmap = 6,
	/* Boot sector (always at cluster 0) in the data attribute. */
	FILE_Boot = 7,
	/* Contains all bad clusters in the non-resident data attribute. */
	FILE_BadClus = 8,
	/*
	 * Shared security descriptors in the data attribute and two indexes
	 * into the descriptors.  Appeared in Windows 2000.  Before that, this
	 * file was named $Quota but was unused.
	 */
	FILE_Secure = 9,
	/*
	 * Uppercase equivalents of all 65536 Unicode characters in the data
	 * attribute.
	 */
	FILE_UpCase = 10,
	/*
	 * Directory containing other system files (eg. $ObjId, $Quota,
	 * $Reparse and $UsnJrnl).  This is new to NTFS3.0.
	 */
	FILE_Extend = 11,
	/* Records 12-15 are reserved for future use. */
	FILE_reserved12 = 12,
	FILE_reserved13 = 13,
	FILE_reserved14 = 14,
	FILE_reserved15 = 15,
	/*
	 * First user file, used as test limit for whether to allow opening a
	 * file or not.
	 */
	FILE_first_user = 16,
} NTFS_SYSTEM_FILES;
255
256/*
257 * These are the so far known MFT_RECORD_* flags (16-bit) which contain
258 * information about the mft record in which they are present.
259 */
260enum {
261 MFT_RECORD_IN_USE = const_cpu_to_le16(0x0001),
262 MFT_RECORD_IS_DIRECTORY = const_cpu_to_le16(0x0002),
263} __attribute__ ((__packed__));
264
265typedef le16 MFT_RECORD_FLAGS;
266
267/*
268 * mft references (aka file references or file record segment references) are
269 * used whenever a structure needs to refer to a record in the mft.
270 *
271 * A reference consists of a 48-bit index into the mft and a 16-bit sequence
272 * number used to detect stale references.
273 *
274 * For error reporting purposes we treat the 48-bit index as a signed quantity.
275 *
276 * The sequence number is a circular counter (skipping 0) describing how many
277 * times the referenced mft record has been (re)used. This has to match the
278 * sequence number of the mft record being referenced, otherwise the reference
279 * is considered stale and removed (FIXME: only ntfsck or the driver itself?).
280 *
281 * If the sequence number is zero it is assumed that no sequence number
282 * consistency checking should be performed.
283 *
284 * FIXME: Since inodes are 32-bit as of now, the driver needs to always check
285 * for high_part being 0 and if not either BUG(), cause a panic() or handle
286 * the situation in some other way. This shouldn't be a problem as a volume has
287 * to become HUGE in order to need more than 32-bits worth of mft records.
288 * Assuming the standard mft record size of 1kiB only the records (never mind
289 * the non-resident attributes, etc.) would require 4TiB of space on their own
290 * for the first 32 bits worth of records. This is only if some strange person
291 * doesn't decide to foul play and make the mft sparse which would be a really
292 * horrible thing to do as it would trash our current driver implementation. )-:
293 * Do I hear screams "we want 64-bit inodes!" ?!? (-;
294 *
295 * FIXME: The mft zone is defined as the first 12% of the volume. This space is
296 * reserved so that the mft can grow contiguously and hence doesn't become
297 * fragmented. Volume free space includes the empty part of the mft zone and
298 * when the volume's free 88% are used up, the mft zone is shrunk by a factor
299 * of 2, thus making more space available for more files/data. This process is
300 * repeated every time there is no more free space except for the mft zone until
301 * there really is no more free space.
302 */
303
304/*
305 * Typedef the MFT_REF as a 64-bit value for easier handling.
306 * Also define two unpacking macros to get to the reference (MREF) and
307 * sequence number (MSEQNO) respectively.
308 * The _LE versions are to be applied on little endian MFT_REFs.
309 * Note: The _LE versions will return a CPU endian formatted value!
310 */
/*
 * Masks selecting the 48-bit mft record number from a 64-bit mft reference,
 * in CPU byte order and in little endian byte order respectively.
 *
 * These are preprocessor constants rather than enumerators because ISO C
 * requires every enumeration constant to be representable as an int, which
 * a 48-bit mask is not.
 */
#define MFT_REF_MASK_CPU	0x0000ffffffffffffULL
#define MFT_REF_MASK_LE		const_cpu_to_le64(MFT_REF_MASK_CPU)

/* Kept so that code naming the former enumeration type still compiles. */
typedef unsigned long long MFT_REF_CONSTS;
315
316typedef u64 MFT_REF;
317typedef le64 leMFT_REF;
318
/*
 * Build a CPU endian mft reference from the mft record number @m (low 48
 * bits) and the sequence number @s (high 16 bits).
 */
#define MK_MREF(m, s)						\
	((MFT_REF)((((MFT_REF)(s)) << 48) |			\
		   (((MFT_REF)(m)) & MFT_REF_MASK_CPU)))
/* As MK_MREF() but with the result passed through cpu_to_le64(). */
#define MK_LE_MREF(m, s)	cpu_to_le64(MK_MREF(m, s))

/* Extract the mft record number and the sequence number, respectively. */
#define MREF(x)			((unsigned long)((x) & MFT_REF_MASK_CPU))
#define MSEQNO(x)		((u16)(((x) >> 48) & 0xffff))
/* The same for little endian references; the result is CPU endian. */
#define MREF_LE(x)		MREF(le64_to_cpu(x))
#define MSEQNO_LE(x)		MSEQNO(le64_to_cpu(x))

/*
 * Error passing through mft references: bit 47, the top bit of the 48-bit
 * record number, marks a reference that carries an error code.
 */
#define IS_ERR_MREF(x)		((((x) & (1ULL << 47)) != 0) ? 1 : 0)
#define ERR_MREF(x)		((u64)((s64)(x)))
#define MREF_ERR(x)		((int)((s64)(x)))
331
332/*
333 * The mft record header present at the beginning of every record in the mft.
334 * This is followed by a sequence of variable length attribute records which
335 * is terminated by an attribute of type AT_END which is a truncated attribute
336 * in that it only consists of the attribute type code AT_END and none of the
337 * other members of the attribute structure are present.
338 */
339typedef struct {
340/*Ofs*/
341/* 0 NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */
342 NTFS_RECORD_TYPE magic; /* Usually the magic is "FILE". */
343 le16 usa_ofs; /* See NTFS_RECORD definition above. */
344 le16 usa_count; /* See NTFS_RECORD definition above. */
345
346/* 8*/ le64 lsn; /* $LogFile sequence number for this record.
347 Changed every time the record is modified. */
348/* 16*/ le16 sequence_number; /* Number of times this mft record has been
349 reused. (See description for MFT_REF
350 above.) NOTE: The increment (skipping zero)
351 is done when the file is deleted. NOTE: If
352 this is zero it is left zero. */
353/* 18*/ le16 link_count; /* Number of hard links, i.e. the number of
354 directory entries referencing this record.
355 NOTE: Only used in mft base records.
356 NOTE: When deleting a directory entry we
357 check the link_count and if it is 1 we
358 delete the file. Otherwise we delete the
359 FILE_NAME_ATTR being referenced by the
360 directory entry from the mft record and
361 decrement the link_count.
362 FIXME: Careful with Win32 + DOS names! */
363/* 20*/ le16 attrs_offset; /* Byte offset to the first attribute in this
364 mft record from the start of the mft record.
365 NOTE: Must be aligned to 8-byte boundary. */
366/* 22*/ MFT_RECORD_FLAGS flags; /* Bit array of MFT_RECORD_FLAGS. When a file
367 is deleted, the MFT_RECORD_IN_USE flag is
368 set to zero. */
369/* 24*/ le32 bytes_in_use; /* Number of bytes used in this mft record.
370 NOTE: Must be aligned to 8-byte boundary. */
371/* 28*/ le32 bytes_allocated; /* Number of bytes allocated for this mft
372 record. This should be equal to the mft
373 record size. */
374/* 32*/ leMFT_REF base_mft_record;/* This is zero for base mft records.
375 When it is not zero it is a mft reference
376 pointing to the base mft record to which
377 this record belongs (this is then used to
378 locate the attribute list attribute present
379 in the base record which describes this
380 extension record and hence might need
381 modification when the extension record
382 itself is modified, also locating the
383 attribute list also means finding the other
384 potential extents, belonging to the non-base
385 mft record). */
386/* 40*/ le16 next_attr_instance;/* The instance number that will be assigned to
387 the next attribute added to this mft record.
388 NOTE: Incremented each time after it is used.
389 NOTE: Every time the mft record is reused
390 this number is set to zero. NOTE: The first
391 instance number is always 0. */
392/* The below fields are specific to NTFS 3.1+ (Windows XP and above): */
393/* 42*/ le16 reserved; /* Reserved/alignment. */
394/* 44*/ le32 mft_record_number; /* Number of this mft record. */
395/* sizeof() = 48 bytes */
396/*
397 * When (re)using the mft record, we place the update sequence array at this
398 * offset, i.e. before we start with the attributes. This also makes sense,
399 * otherwise we could run into problems with the update sequence array
400 * containing in itself the last two bytes of a sector which would mean that
401 * multi sector transfer protection wouldn't work. As you can't protect data
402 * by overwriting it since you then can't get it back...
403 * When reading we obviously use the data from the ntfs record header.
404 */
405} __attribute__ ((__packed__)) MFT_RECORD;
406
407/* This is the version without the NTFS 3.1+ specific fields. */
408typedef struct {
409/*Ofs*/
410/* 0 NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */
411 NTFS_RECORD_TYPE magic; /* Usually the magic is "FILE". */
412 le16 usa_ofs; /* See NTFS_RECORD definition above. */
413 le16 usa_count; /* See NTFS_RECORD definition above. */
414
415/* 8*/ le64 lsn; /* $LogFile sequence number for this record.
416 Changed every time the record is modified. */
417/* 16*/ le16 sequence_number; /* Number of times this mft record has been
418 reused. (See description for MFT_REF
419 above.) NOTE: The increment (skipping zero)
420 is done when the file is deleted. NOTE: If
421 this is zero it is left zero. */
422/* 18*/ le16 link_count; /* Number of hard links, i.e. the number of
423 directory entries referencing this record.
424 NOTE: Only used in mft base records.
425 NOTE: When deleting a directory entry we
426 check the link_count and if it is 1 we
427 delete the file. Otherwise we delete the
428 FILE_NAME_ATTR being referenced by the
429 directory entry from the mft record and
430 decrement the link_count.
431 FIXME: Careful with Win32 + DOS names! */
432/* 20*/ le16 attrs_offset; /* Byte offset to the first attribute in this
433 mft record from the start of the mft record.
434 NOTE: Must be aligned to 8-byte boundary. */
435/* 22*/ MFT_RECORD_FLAGS flags; /* Bit array of MFT_RECORD_FLAGS. When a file
436 is deleted, the MFT_RECORD_IN_USE flag is
437 set to zero. */
438/* 24*/ le32 bytes_in_use; /* Number of bytes used in this mft record.
439 NOTE: Must be aligned to 8-byte boundary. */
440/* 28*/ le32 bytes_allocated; /* Number of bytes allocated for this mft
441 record. This should be equal to the mft
442 record size. */
443/* 32*/ leMFT_REF base_mft_record;/* This is zero for base mft records.
444 When it is not zero it is a mft reference
445 pointing to the base mft record to which
446 this record belongs (this is then used to
447 locate the attribute list attribute present
448 in the base record which describes this
449 extension record and hence might need
450 modification when the extension record
451 itself is modified, also locating the
452 attribute list also means finding the other
453 potential extents, belonging to the non-base
454 mft record). */
455/* 40*/ le16 next_attr_instance;/* The instance number that will be assigned to
456 the next attribute added to this mft record.
457 NOTE: Incremented each time after it is used.
458 NOTE: Every time the mft record is reused
459 this number is set to zero. NOTE: The first
460 instance number is always 0. */
461/* sizeof() = 42 bytes */
462/*
463 * When (re)using the mft record, we place the update sequence array at this
464 * offset, i.e. before we start with the attributes. This also makes sense,
465 * otherwise we could run into problems with the update sequence array
466 * containing in itself the last two bytes of a sector which would mean that
467 * multi sector transfer protection wouldn't work. As you can't protect data
468 * by overwriting it since you then can't get it back...
469 * When reading we obviously use the data from the ntfs record header.
470 */
471} __attribute__ ((__packed__)) MFT_RECORD_OLD;
472
473/*
474 * System defined attributes (32-bit). Each attribute type has a corresponding
475 * attribute name (Unicode string of maximum 64 character length) as described
476 * by the attribute definitions present in the data attribute of the $AttrDef
477 * system file. On NTFS 3.0 volumes the names are just as the types are named
478 * in the below defines exchanging AT_ for the dollar sign ($). If that is not
479 * a revealing choice of symbol I do not know what is... (-;
480 */
481enum {
482 AT_UNUSED = const_cpu_to_le32( 0),
483 AT_STANDARD_INFORMATION = const_cpu_to_le32( 0x10),
484 AT_ATTRIBUTE_LIST = const_cpu_to_le32( 0x20),
485 AT_FILE_NAME = const_cpu_to_le32( 0x30),
486 AT_OBJECT_ID = const_cpu_to_le32( 0x40),
487 AT_SECURITY_DESCRIPTOR = const_cpu_to_le32( 0x50),
488 AT_VOLUME_NAME = const_cpu_to_le32( 0x60),
489 AT_VOLUME_INFORMATION = const_cpu_to_le32( 0x70),
490 AT_DATA = const_cpu_to_le32( 0x80),
491 AT_INDEX_ROOT = const_cpu_to_le32( 0x90),
492 AT_INDEX_ALLOCATION = const_cpu_to_le32( 0xa0),
493 AT_BITMAP = const_cpu_to_le32( 0xb0),
494 AT_REPARSE_POINT = const_cpu_to_le32( 0xc0),
495 AT_EA_INFORMATION = const_cpu_to_le32( 0xd0),
496 AT_EA = const_cpu_to_le32( 0xe0),
497 AT_PROPERTY_SET = const_cpu_to_le32( 0xf0),
498 AT_LOGGED_UTILITY_STREAM = const_cpu_to_le32( 0x100),
499 AT_FIRST_USER_DEFINED_ATTRIBUTE = const_cpu_to_le32( 0x1000),
500 AT_END = const_cpu_to_le32(0xffffffff)
501};
502
503typedef le32 ATTR_TYPE;
504
505/*
506 * The collation rules for sorting views/indexes/etc (32-bit).
507 *
508 * COLLATION_BINARY - Collate by binary compare where the first byte is most
509 * significant.
510 * COLLATION_UNICODE_STRING - Collate Unicode strings by comparing their binary
511 * Unicode values, except that when a character can be uppercased, the
512 * upper case value collates before the lower case one.
513 * COLLATION_FILE_NAME - Collate file names as Unicode strings. The collation
514 * is done very much like COLLATION_UNICODE_STRING. In fact I have no idea
515 * what the difference is. Perhaps the difference is that file names
516 * would treat some special characters in an odd way (see
517 * unistr.c::ntfs_collate_names() and unistr.c::legal_ansi_char_array[]
518 * for what I mean) but COLLATION_UNICODE_STRING would not give any special
519 * treatment to any characters at all, but this is speculation.
520 * COLLATION_NTOFS_ULONG - Sorting is done according to ascending le32 key
521 * values. E.g. used for $SII index in FILE_Secure, which sorts by
522 * security_id (le32).
523 * COLLATION_NTOFS_SID - Sorting is done according to ascending SID values.
524 * E.g. used for $O index in FILE_Extend/$Quota.
525 * COLLATION_NTOFS_SECURITY_HASH - Sorting is done first by ascending hash
526 * values and second by ascending security_id values. E.g. used for $SDH
527 * index in FILE_Secure.
528 * COLLATION_NTOFS_ULONGS - Sorting is done according to a sequence of ascending
529 * le32 key values. E.g. used for $O index in FILE_Extend/$ObjId, which
530 * sorts by object_id (16-byte), by splitting up the object_id in four
531 * le32 values and using them as individual keys. E.g. take the following
532 * two object_ids, stored as follows on disk:
533 * 1st: a1 61 65 b7 65 7b d4 11 9e 3d 00 e0 81 10 42 59
534 * 2nd: 38 14 37 d2 d2 f3 d4 11 a5 21 c8 6b 79 b1 97 45
535 * To compare them, they are split into four le32 values each, like so:
536 * 1st: 0xb76561a1 0x11d47b65 0xe0003d9e 0x59421081
537 * 2nd: 0xd2371438 0x11d4f3d2 0x6bc821a5 0x4597b179
538 * Now, it is apparent why the 2nd object_id collates after the 1st: the
539 * first le32 value of the 1st object_id is less than the first le32 of
540 * the 2nd object_id. If the first le32 values of both object_ids were
541 * equal then the second le32 values would be compared, etc.
542 */
543enum {
544 COLLATION_BINARY = const_cpu_to_le32(0x00),
545 COLLATION_FILE_NAME = const_cpu_to_le32(0x01),
546 COLLATION_UNICODE_STRING = const_cpu_to_le32(0x02),
547 COLLATION_NTOFS_ULONG = const_cpu_to_le32(0x10),
548 COLLATION_NTOFS_SID = const_cpu_to_le32(0x11),
549 COLLATION_NTOFS_SECURITY_HASH = const_cpu_to_le32(0x12),
550 COLLATION_NTOFS_ULONGS = const_cpu_to_le32(0x13)
551};
552
553typedef le32 COLLATION_RULE;
554
555/*
556 * The flags (32-bit) describing attribute properties in the attribute
557 * definition structure. FIXME: This information is from Regis's information
558 * and, according to him, it is not certain and probably incomplete.
559 * The INDEXABLE flag is fairly certainly correct as only the file name
560 * attribute has this flag set and this is the only attribute indexed in NT4.
561 */
562enum {
563 INDEXABLE = const_cpu_to_le32(0x02), /* Attribute can be
564 indexed. */
565 NEED_TO_REGENERATE = const_cpu_to_le32(0x40), /* Need to regenerate
566 during regeneration
567 phase. */
568 CAN_BE_NON_RESIDENT = const_cpu_to_le32(0x80), /* Attribute can be
569 non-resident. */
570};
571
572typedef le32 ATTR_DEF_FLAGS;
573
574/*
575 * The data attribute of FILE_AttrDef contains a sequence of attribute
576 * definitions for the NTFS volume. With this, it is supposed to be safe for an
577 * older NTFS driver to mount a volume containing a newer NTFS version without
578 * damaging it (that's the theory. In practice it's: not damaging it too much).
579 * Entries are sorted by attribute type. The flags describe whether the
580 * attribute can be resident/non-resident and possibly other things, but the
581 * actual bits are unknown.
582 */
583typedef struct {
584/*hex ofs*/
585/* 0*/ ntfschar name[0x40]; /* Unicode name of the attribute. Zero
586 terminated. */
587/* 80*/ ATTR_TYPE type; /* Type of the attribute. */
588/* 84*/ le32 display_rule; /* Default display rule.
589 FIXME: What does it mean? (AIA) */
590/* 88*/ COLLATION_RULE collation_rule; /* Default collation rule. */
591/* 8c*/ ATTR_DEF_FLAGS flags; /* Flags describing the attribute. */
592/* 90*/ sle64 min_size; /* Optional minimum attribute size. */
593/* 98*/ sle64 max_size; /* Maximum size of attribute. */
594/* sizeof() = 0xa0 or 160 bytes */
595} __attribute__ ((__packed__)) ATTR_DEF;
596
597/*
598 * Attribute flags (16-bit).
599 */
600enum {
601 ATTR_IS_COMPRESSED = const_cpu_to_le16(0x0001),
602 ATTR_COMPRESSION_MASK = const_cpu_to_le16(0x00ff), /* Compression method
603 mask. Also, first
604 illegal value. */
605 ATTR_IS_ENCRYPTED = const_cpu_to_le16(0x4000),
606 ATTR_IS_SPARSE = const_cpu_to_le16(0x8000),
607} __attribute__ ((__packed__));
608
609typedef le16 ATTR_FLAGS;
610
611/*
612 * Attribute compression.
613 *
614 * Only the data attribute is ever compressed in the current ntfs driver in
615 * Windows. Further, compression is only applied when the data attribute is
616 * non-resident. Finally, to use compression, the maximum allowed cluster size
617 * on a volume is 4kib.
618 *
619 * The compression method is based on independently compressing blocks of X
620 * clusters, where X is determined from the compression_unit value found in the
621 * non-resident attribute record header (more precisely: X = 2^compression_unit
622 * clusters). On Windows NT/2k, X always is 16 clusters (compression_unit = 4).
623 *
624 * There are three different cases of how a compression block of X clusters
625 * can be stored:
626 *
627 * 1) The data in the block is all zero (a sparse block):
628 * This is stored as a sparse block in the runlist, i.e. the runlist
629 * entry has length = X and lcn = -1. The mapping pairs array actually
630 * uses a delta_lcn value length of 0, i.e. delta_lcn is not present at
631 * all, which is then interpreted by the driver as lcn = -1.
632 * NOTE: Even uncompressed files can be sparse on NTFS 3.0 volumes, then
633 * the same principles apply as above, except that the length is not
634 * restricted to being any particular value.
635 *
636 * 2) The data in the block is not compressed:
637 * This happens when compression doesn't reduce the size of the block
638 * in clusters. I.e. if compression has a small effect so that the
639 * compressed data still occupies X clusters, then the uncompressed data
640 * is stored in the block.
641 * This case is recognised by the fact that the runlist entry has
642 * length = X and lcn >= 0. The mapping pairs array stores this as
643 * normal with a run length of X and some specific delta_lcn, i.e.
644 * delta_lcn has to be present.
645 *
646 * 3) The data in the block is compressed:
647 * The common case. This case is recognised by the fact that the run
648 * list entry has length L < X and lcn >= 0. The mapping pairs array
649 * stores this as normal with a run length of L and some specific
650 * delta_lcn, i.e. delta_lcn has to be present. This runlist entry is
651 * immediately followed by a sparse entry with length = X - L and
652 * lcn = -1. The latter entry is to make up the vcn counting to the
653 * full compression block size X.
654 *
655 * In fact, life is more complicated because adjacent entries of the same type
656 * can be coalesced. This means that one has to keep track of the number of
657 * clusters handled and work on a basis of X clusters at a time being one
658 * block. An example: if length L > X this means that this particular runlist
659 * entry contains a block of length X and part of one or more blocks of length
660 * L - X. Another example: if length L < X, this does not necessarily mean that
661 * the block is compressed as it might be that the lcn changes inside the block
662 * and hence the following runlist entry describes the continuation of the
663 * potentially compressed block. The block would be compressed if the
664 * following runlist entry describes at least X - L sparse clusters, thus
665 * making up the compression block length as described in point 3 above. (Of
666 * course, there can be several runlist entries with small lengths so that the
667 * sparse entry does not follow the first data containing entry with
668 * length < X.)
669 *
670 * NOTE: At the end of the compressed attribute value, there most likely is not
671 * just the right amount of data to make up a compression block, thus this data
672 * is not even attempted to be compressed. It is just stored as is, unless
673 * the number of clusters it occupies is reduced when compressed in which case
674 * it is stored as a compressed compression block, complete with sparse
675 * clusters at the end.
676 */
677
678/*
679 * Flags of resident attributes (8-bit).
680 */
681enum {
682 RESIDENT_ATTR_IS_INDEXED = 0x01, /* Attribute is referenced in an index
683 (has implications for deleting and
684 modifying the attribute). */
685} __attribute__ ((__packed__));
686
687typedef u8 RESIDENT_ATTR_FLAGS;
688
689/*
690 * Attribute record header. Always aligned to 8-byte boundary.
691 */
692typedef struct {
693/*Ofs*/
694/* 0*/ ATTR_TYPE type; /* The (32-bit) type of the attribute. */
695/* 4*/ le32 length; /* Byte size of the resident part of the
696 attribute (aligned to 8-byte boundary).
697 Used to get to the next attribute. */
698/* 8*/ u8 non_resident; /* If 0, attribute is resident.
699 If 1, attribute is non-resident. */
700/* 9*/ u8 name_length; /* Unicode character size of name of attribute.
701 0 if unnamed. */
702/* 10*/ le16 name_offset; /* If name_length != 0, the byte offset to the
703 beginning of the name from the attribute
704 record. Note that the name is stored as a
705 Unicode string. When creating, place offset
706 just at the end of the record header. Then,
707 follow with attribute value or mapping pairs
708 array, resident and non-resident attributes
709 respectively, aligning to an 8-byte
710 boundary. */
711/* 12*/ ATTR_FLAGS flags; /* Flags describing the attribute. */
712/* 14*/ le16 instance; /* The instance of this attribute record. This
713 number is unique within this mft record (see
714 MFT_RECORD/next_attribute_instance notes in
715 mft.h for more details). */
716/* 16*/ union {
717 /* Resident attributes. */
718 struct {
719/* 16 */ le32 value_length;/* Byte size of attribute value. */
720/* 20 */ le16 value_offset;/* Byte offset of the attribute
721 value from the start of the
722 attribute record. When creating,
723 align to 8-byte boundary if we
724 have a name present as this might
725 not have a length of a multiple
726 of 8-bytes. */
727/* 22 */ RESIDENT_ATTR_FLAGS flags; /* See above. */
728/* 23 */ s8 reserved; /* Reserved/alignment to 8-byte
729 boundary. */
730 } __attribute__ ((__packed__)) resident;
731 /* Non-resident attributes. */
732 struct {
733/* 16*/ leVCN lowest_vcn;/* Lowest valid virtual cluster number
734 for this portion of the attribute value or
735 0 if this is the only extent (usually the
736 case). - Only when an attribute list is used
737 does lowest_vcn != 0 ever occur. */
738/* 24*/ leVCN highest_vcn;/* Highest valid vcn of this extent of
739 the attribute value. - Usually there is only one
740 portion, so this usually equals the attribute
741 value size in clusters minus 1. Can be -1 for
742 zero length files. Can be 0 for "single extent"
743 attributes. */
744/* 32*/ le16 mapping_pairs_offset; /* Byte offset from the
745 beginning of the structure to the mapping pairs
746 array which contains the mappings between the
747 vcns and the logical cluster numbers (lcns).
748 When creating, place this at the end of this
749 record header aligned to 8-byte boundary. */
750/* 34*/ u8 compression_unit; /* The compression unit expressed
751 as the log to the base 2 of the number of
752 clusters in a compression unit. 0 means not
753 compressed. (This effectively limits the
754 compression unit size to be a power of two
755 clusters.) WinNT4 only uses a value of 4. */
756/* 35*/ u8 reserved[5]; /* Align to 8-byte boundary. */
757/* The sizes below are only used when lowest_vcn is zero, as otherwise it would
758 be difficult to keep them up-to-date.*/
759/* 40*/ sle64 allocated_size; /* Byte size of disk space
760 allocated to hold the attribute value. Always
761 is a multiple of the cluster size. When a file
762 is compressed, this field is a multiple of the
763 compression block size (2^compression_unit) and
764 it represents the logically allocated space
765 rather than the actual on disk usage. For this
766 use the compressed_size (see below). */
767/* 48*/ sle64 data_size; /* Byte size of the attribute
768 value. Can be larger than allocated_size if
769 attribute value is compressed or sparse. */
770/* 56*/ sle64 initialized_size; /* Byte size of initialized
771 portion of the attribute value. Usually equals
772 data_size. */
773/* sizeof(uncompressed attr) = 64*/
774/* 64*/ sle64 compressed_size; /* Byte size of the attribute
775 value after compression. Only present when
776 compressed. Always is a multiple of the
777 cluster size. Represents the actual amount of
778 disk space being used on the disk. */
779/* sizeof(compressed attr) = 72*/
780 } __attribute__ ((__packed__)) non_resident;
781 } __attribute__ ((__packed__)) data;
782} __attribute__ ((__packed__)) ATTR_RECORD;
783
784typedef ATTR_RECORD ATTR_REC;
785
786/*
787 * File attribute flags (32-bit).
788 */
789enum {
790 /*
791 * The following flags are only present in the STANDARD_INFORMATION
792 * attribute (in the field file_attributes).
793 */
794 FILE_ATTR_READONLY = const_cpu_to_le32(0x00000001),
795 FILE_ATTR_HIDDEN = const_cpu_to_le32(0x00000002),
796 FILE_ATTR_SYSTEM = const_cpu_to_le32(0x00000004),
797 /* Old DOS volid. Unused in NT. = const_cpu_to_le32(0x00000008), */
798
799 FILE_ATTR_DIRECTORY = const_cpu_to_le32(0x00000010),
800 /* Note, FILE_ATTR_DIRECTORY is not considered valid in NT. It is
801 reserved for the DOS SUBDIRECTORY flag. */
802 FILE_ATTR_ARCHIVE = const_cpu_to_le32(0x00000020),
803 FILE_ATTR_DEVICE = const_cpu_to_le32(0x00000040),
804 FILE_ATTR_NORMAL = const_cpu_to_le32(0x00000080),
805
806 FILE_ATTR_TEMPORARY = const_cpu_to_le32(0x00000100),
807 FILE_ATTR_SPARSE_FILE = const_cpu_to_le32(0x00000200),
808 FILE_ATTR_REPARSE_POINT = const_cpu_to_le32(0x00000400),
809 FILE_ATTR_COMPRESSED = const_cpu_to_le32(0x00000800),
810
811 FILE_ATTR_OFFLINE = const_cpu_to_le32(0x00001000),
812 FILE_ATTR_NOT_CONTENT_INDEXED = const_cpu_to_le32(0x00002000),
813 FILE_ATTR_ENCRYPTED = const_cpu_to_le32(0x00004000),
814
815 FILE_ATTR_VALID_FLAGS = const_cpu_to_le32(0x00007fb7),
816 /* Note, FILE_ATTR_VALID_FLAGS masks out the old DOS VolId and the
817 FILE_ATTR_DEVICE and preserves everything else. This mask is used
818 to obtain all flags that are valid for reading. */
819 FILE_ATTR_VALID_SET_FLAGS = const_cpu_to_le32(0x000031a7),
820 /* Note, FILE_ATTR_VALID_SET_FLAGS masks out the old DOS VolId, the
821 F_A_DEVICE, F_A_DIRECTORY, F_A_SPARSE_FILE, F_A_REPARSE_POINT,
822 F_A_COMPRESSED, and F_A_ENCRYPTED and preserves the rest. This mask
823 is used to obtain all flags that are valid for setting. */
824
825 /*
826 * The following flags are only present in the FILE_NAME attribute (in
827 * the field file_attributes).
828 */
829 FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT = const_cpu_to_le32(0x10000000),
830 /* Note, this is a copy of the corresponding bit from the mft record,
831 telling us whether this is a directory or not, i.e. whether it has
832 an index root attribute or not. */
833 FILE_ATTR_DUP_VIEW_INDEX_PRESENT = const_cpu_to_le32(0x20000000),
834 /* Note, this is a copy of the corresponding bit from the mft record,
835 telling us whether this file has a view index present (eg. object id
836 index, quota index, one of the security indexes or the encrypting
837 file system related indexes). */
838};
839
840typedef le32 FILE_ATTR_FLAGS;
841
842/*
843 * NOTE on times in NTFS: All times are in MS standard time format, i.e. they
844 * are the number of 100-nanosecond intervals since 1st January 1601, 00:00:00
845 * universal coordinated time (UTC). (In Linux time starts 1st January 1970,
846 * 00:00:00 UTC and is stored as the number of 1-second intervals since then.)
847 */
848
849/*
850 * Attribute: Standard information (0x10).
851 *
852 * NOTE: Always resident.
853 * NOTE: Present in all base file records on a volume.
854 * NOTE: There is conflicting information about the meaning of each of the time
855 * fields but the meaning as defined below has been verified to be
856 * correct by practical experimentation on Windows NT4 SP6a and is hence
857 * assumed to be the one and only correct interpretation.
858 */
859typedef struct {
860/*Ofs*/
861/* 0*/ sle64 creation_time; /* Time file was created. Updated when
862 a filename is changed(?). */
863/* 8*/ sle64 last_data_change_time; /* Time the data attribute was last
864 modified. */
865/* 16*/ sle64 last_mft_change_time; /* Time this mft record was last
866 modified. */
867/* 24*/ sle64 last_access_time; /* Approximate time when the file was
868 last accessed (obviously this is not
869 updated on read-only volumes). In
870 Windows this is only updated when
871 accessed if some time delta has
872 passed since the last update. Also,
873 last access time updates can be
874 disabled altogether for speed. */
875/* 32*/ FILE_ATTR_FLAGS file_attributes; /* Flags describing the file. */
876/* 36*/ union {
877 /* NTFS 1.2 */
878 struct {
879 /* 36*/ u8 reserved12[12]; /* Reserved/alignment to 8-byte
880 boundary. */
881 } __attribute__ ((__packed__)) v1;
882 /* sizeof() = 48 bytes */
883 /* NTFS 3.x */
884 struct {
885/*
886 * If a volume has been upgraded from a previous NTFS version, then these
887 * fields are present only if the file has been accessed since the upgrade.
888 * Recognize the difference by comparing the length of the resident attribute
889 * value. If it is 48, then the following fields are missing. If it is 72 then
890 * the fields are present. Maybe just check like this:
891 * if (resident.ValueLength < sizeof(STANDARD_INFORMATION)) {
892 * Assume NTFS 1.2- format.
893 * If (volume version is 3.x)
894 * Upgrade attribute to NTFS 3.x format.
895 * else
896 * Use NTFS 1.2- format for access.
897 * } else
898 * Use NTFS 3.x format for access.
899 * Only problem is that it might be legal to set the length of the value to
900 * arbitrarily large values thus spoiling this check. - But chkdsk probably
901 * views that as a corruption, assuming that it behaves like this for all
902 * attributes.
903 */
904 /* 36*/ le32 maximum_versions; /* Maximum allowed versions for
905 file. Zero if version numbering is disabled. */
906 /* 40*/ le32 version_number; /* This file's version (if any).
907 Set to zero if maximum_versions is zero. */
908 /* 44*/ le32 class_id; /* Class id from bidirectional
909 class id index (?). */
910 /* 48*/ le32 owner_id; /* Owner_id of the user owning
911 the file. Translate via $Q index in FILE_Extend
912 /$Quota to the quota control entry for the user
913 owning the file. Zero if quotas are disabled. */
914 /* 52*/ le32 security_id; /* Security_id for the file.
915 Translate via $SII index and $SDS data stream
916 in FILE_Secure to the security descriptor. */
917 /* 56*/ le64 quota_charged; /* Byte size of the charge to
918 the quota for all streams of the file. Note: Is
919 zero if quotas are disabled. */
920 /* 64*/ le64 usn; /* Last update sequence number
921 of the file. This is a direct index into the
922 change (aka usn) journal file. It is zero if
923 the usn journal is disabled.
924 NOTE: To disable the journal need to delete
925 the journal file itself and to then walk the
926 whole mft and set all Usn entries in all mft
927 records to zero! (This can take a while!)
928 The journal is FILE_Extend/$UsnJrnl. Win2k
929 will recreate the journal and initiate
930 logging if necessary when mounting the
931 partition. This, in contrast to disabling the
932 journal is a very fast process, so the user
933 won't even notice it. */
934 } __attribute__ ((__packed__)) v3;
935 /* sizeof() = 72 bytes (NTFS 3.x) */
936 } __attribute__ ((__packed__)) ver;
937} __attribute__ ((__packed__)) STANDARD_INFORMATION;
938
939/*
940 * Attribute: Attribute list (0x20).
941 *
942 * - Can be either resident or non-resident.
943 * - Value consists of a sequence of variable length, 8-byte aligned,
944 * ATTR_LIST_ENTRY records.
945 * - The list is not terminated by anything at all! The only way to know when
946 * the end is reached is to keep track of the current offset and compare it to
947 * the attribute value size.
948 * - The attribute list attribute contains one entry for each attribute of
949 * the file in which the list is located, except for the list attribute
950 * itself. The list is sorted: first by attribute type, second by attribute
951 * name (if present), third by instance number. The extents of one
952 * non-resident attribute (if present) immediately follow after the initial
953 * extent. They are ordered by lowest_vcn and have their instance set to zero.
954 * It is not allowed to have two attributes with all sorting keys equal.
955 * - Further restrictions:
956 * - If not resident, the vcn to lcn mapping array has to fit inside the
957 * base mft record.
958 * - The attribute list attribute value has a maximum size of 256kb. This
959 * is imposed by the Windows cache manager.
960 * - Attribute lists are only used when the attributes of an mft record do not
961 * fit inside the mft record despite all attributes (that can be made
962 * non-resident) having been made non-resident. This can happen e.g. when:
963 * - File has a large number of hard links (lots of file name
964 * attributes present).
965 * - The mapping pairs array of some non-resident attribute becomes so
966 * large due to fragmentation that it overflows the mft record.
967 * - The security descriptor is very complex (not applicable to
968 * NTFS 3.0 volumes).
969 * - There are many named streams.
970 */
971typedef struct {
972/*Ofs*/
973/* 0*/ ATTR_TYPE type; /* Type of referenced attribute. */
974/* 4*/ le16 length; /* Byte size of this entry (8-byte aligned). */
975/* 6*/ u8 name_length; /* Size in Unicode chars of the name of the
976 attribute or 0 if unnamed. */
977/* 7*/ u8 name_offset; /* Byte offset to beginning of attribute name
978 (always set this to where the name would
979 start even if unnamed). */
980/* 8*/ leVCN lowest_vcn; /* Lowest virtual cluster number of this portion
981 of the attribute value. This is usually 0. It
982 is non-zero for the case where one attribute
983 does not fit into one mft record and thus
984 several mft records are allocated to hold
985 this attribute. In the latter case, each mft
986 record holds one extent of the attribute and
987 there is one attribute list entry for each
988 extent. NOTE: This is DEFINITELY a signed
989 value! The windows driver uses cmp, followed
990 by jg when comparing this, thus it treats it
991 as signed. */
992/* 16*/ leMFT_REF mft_reference;/* The reference of the mft record holding
993 the ATTR_RECORD for this portion of the
994 attribute value. */
995/* 24*/ le16 instance; /* If lowest_vcn = 0, the instance of the
996 attribute being referenced; otherwise 0. */
997/* 26*/ ntfschar name[0]; /* Use when creating only. When reading use
998 name_offset to determine the location of the
999 name. */
1000/* sizeof() = 26 + (attribute_name_length * 2) bytes */
1001} __attribute__ ((__packed__)) ATTR_LIST_ENTRY;
1002
1003/*
1004 * The maximum allowed length for a file name.
1005 */
1006#define MAXIMUM_FILE_NAME_LENGTH 255
1007
1008/*
1009 * Possible namespaces for filenames in ntfs (8-bit).
1010 */
1011enum {
1012 FILE_NAME_POSIX = 0x00,
1013 /* This is the largest namespace. It is case sensitive and allows all
1014 Unicode characters except for: '\0' and '/'. Beware that in
1015 WinNT/2k files which eg have the same name except for their case
1016 will not be distinguished by the standard utilities and thus a "del
1017 filename" will delete both "filename" and "fileName" without
1018 warning. */
1019 FILE_NAME_WIN32 = 0x01,
1020 /* The standard WinNT/2k NTFS long filenames. Case insensitive. All
1021 Unicode chars except: '\0', '"', '*', '/', ':', '<', '>', '?', '\',
1022 and '|'. Further, names cannot end with a '.' or a space. */
1023 FILE_NAME_DOS = 0x02,
1024 /* The standard DOS filenames (8.3 format). Uppercase only. All 8-bit
1025 characters greater than space, except: '"', '*', '+', ',', '/', ':', ';',
1026 '<', '=', '>', '?', and '\'. */
1027 FILE_NAME_WIN32_AND_DOS = 0x03,
1028 /* 3 means that both the Win32 and the DOS filenames are identical and
1029 hence have been saved in this single filename record. */
1030} __attribute__ ((__packed__));
1031
1032typedef u8 FILE_NAME_TYPE_FLAGS;
1033
1034/*
1035 * Attribute: Filename (0x30).
1036 *
1037 * NOTE: Always resident.
1038 * NOTE: All fields, except the parent_directory, are only updated when the
1039 * filename is changed. Until then, they just become out of sync with
1040 * reality and the more up to date values are present in the standard
1041 * information attribute.
1042 * NOTE: There is conflicting information about the meaning of each of the time
1043 * fields but the meaning as defined below has been verified to be
1044 * correct by practical experimentation on Windows NT4 SP6a and is hence
1045 * assumed to be the one and only correct interpretation.
1046 */
1047typedef struct {
1048/*hex ofs*/
1049/* 0*/ leMFT_REF parent_directory; /* Directory this filename is
1050 referenced from. */
1051/* 8*/ sle64 creation_time; /* Time file was created. */
1052/* 10*/ sle64 last_data_change_time; /* Time the data attribute was last
1053 modified. */
1054/* 18*/ sle64 last_mft_change_time; /* Time this mft record was last
1055 modified. */
1056/* 20*/ sle64 last_access_time; /* Time this mft record was last
1057 accessed. */
1058/* 28*/ sle64 allocated_size; /* Byte size of allocated space for the
1059 data attribute. NOTE: Is a multiple
1060 of the cluster size. */
1061/* 30*/ sle64 data_size; /* Byte size of actual data in data
1062 attribute. */
1063/* 38*/ FILE_ATTR_FLAGS file_attributes; /* Flags describing the file. */
1064/* 3c*/ union {
1065 /* 3c*/ struct {
1066 /* 3c*/ le16 packed_ea_size; /* Size of the buffer needed to
1067 pack the extended attributes
1068 (EAs), if such are present.*/
1069 /* 3e*/ le16 reserved; /* Reserved for alignment. */
1070 } __attribute__ ((__packed__)) ea;
1071 /* 3c*/ struct {
1072 /* 3c*/ le32 reparse_point_tag; /* Type of reparse point,
1073 present only in reparse
1074 points and only if there are
1075 no EAs. */
1076 } __attribute__ ((__packed__)) rp;
1077 } __attribute__ ((__packed__)) type;
1078/* 40*/ u8 file_name_length; /* Length of file name in
1079 (Unicode) characters. */
1080/* 41*/ FILE_NAME_TYPE_FLAGS file_name_type; /* Namespace of the file name.*/
1081/* 42*/ ntfschar file_name[0]; /* File name in Unicode. */
1082} __attribute__ ((__packed__)) FILE_NAME_ATTR;
1083
1084/*
1085 * GUID structures store globally unique identifiers (GUID). A GUID is a
1086 * 128-bit value consisting of one group of eight hexadecimal digits, followed
1087 * by three groups of four hexadecimal digits each, followed by one group of
1088 * twelve hexadecimal digits. GUIDs are Microsoft's implementation of the
1089 * distributed computing environment (DCE) universally unique identifier (UUID).
1090 * Example of a GUID:
1091 * 1F010768-5A73-BC91-0010A52216A7
1092 */
1093typedef struct {
1094 le32 data1; /* The first eight hexadecimal digits of the GUID. */
1095 le16 data2; /* The first group of four hexadecimal digits. */
1096 le16 data3; /* The second group of four hexadecimal digits. */
1097 u8 data4[8]; /* The first two bytes are the third group of four
1098 hexadecimal digits. The remaining six bytes are the
1099 final 12 hexadecimal digits. */
1100} __attribute__ ((__packed__)) GUID;
1101
1102/*
1103 * FILE_Extend/$ObjId contains an index named $O. This index contains all
1104 * object_ids present on the volume as the index keys and the corresponding
1105 * mft_record numbers as the index entry data parts. The data part (defined
1106 * below) also contains three other object_ids:
1107 * birth_volume_id - object_id of FILE_Volume on which the file was first
1108 * created. Optional (i.e. can be zero).
1109 * birth_object_id - object_id of file when it was first created. Usually
1110 * equals the object_id. Optional (i.e. can be zero).
1111 * domain_id - Reserved (always zero).
1112 */
1113typedef struct {
1114 leMFT_REF mft_reference;/* Mft record containing the object_id in
1115 the index entry key. */
1116 union {
1117 struct {
1118 GUID birth_volume_id;
1119 GUID birth_object_id;
1120 GUID domain_id;
1121 } __attribute__ ((__packed__)) origin;
1122 u8 extended_info[48];
1123 } __attribute__ ((__packed__)) opt;
1124} __attribute__ ((__packed__)) OBJ_ID_INDEX_DATA;
1125
1126/*
1127 * Attribute: Object id (NTFS 3.0+) (0x40).
1128 *
1129 * NOTE: Always resident.
1130 */
1131typedef struct {
1132 GUID object_id; /* Unique id assigned to the
1133 file.*/
1134 /* The following fields are optional. The attribute value size is 16
1135 bytes, i.e. sizeof(GUID), if these are not present at all. Note,
1136 the entries can be present but one or more (or all) can be zero
1137 meaning that that particular value(s) is(are) not defined. */
1138 union {
1139 struct {
1140 GUID birth_volume_id; /* Unique id of volume on which
1141 the file was first created.*/
1142 GUID birth_object_id; /* Unique id of file when it was
1143 first created. */
1144 GUID domain_id; /* Reserved, zero. */
1145 } __attribute__ ((__packed__)) origin;
1146 u8 extended_info[48];
1147 } __attribute__ ((__packed__)) opt;
1148} __attribute__ ((__packed__)) OBJECT_ID_ATTR;
1149
1150/*
1151 * The pre-defined IDENTIFIER_AUTHORITIES used as SID_IDENTIFIER_AUTHORITY in
1152 * the SID structure (see below).
1153 */
1154//typedef enum { /* SID string prefix. */
1155// SECURITY_NULL_SID_AUTHORITY = {0, 0, 0, 0, 0, 0}, /* S-1-0 */
1156// SECURITY_WORLD_SID_AUTHORITY = {0, 0, 0, 0, 0, 1}, /* S-1-1 */
1157// SECURITY_LOCAL_SID_AUTHORITY = {0, 0, 0, 0, 0, 2}, /* S-1-2 */
1158// SECURITY_CREATOR_SID_AUTHORITY = {0, 0, 0, 0, 0, 3}, /* S-1-3 */
1159// SECURITY_NON_UNIQUE_AUTHORITY = {0, 0, 0, 0, 0, 4}, /* S-1-4 */
1160// SECURITY_NT_SID_AUTHORITY = {0, 0, 0, 0, 0, 5}, /* S-1-5 */
1161//} IDENTIFIER_AUTHORITIES;
1162
1163/*
1164 * These relative identifiers (RIDs) are used with the above identifier
1165 * authorities to make up universal well-known SIDs.
1166 *
1167 * Note: The relative identifier (RID) refers to the portion of a SID, which
1168 * identifies a user or group in relation to the authority that issued the SID.
1169 * For example, the universal well-known SID Creator Owner ID (S-1-3-0) is
1170 * made up of the identifier authority SECURITY_CREATOR_SID_AUTHORITY (3) and
1171 * the relative identifier SECURITY_CREATOR_OWNER_RID (0).
1172 */
1173typedef enum { /* Identifier authority. */
1174 SECURITY_NULL_RID = 0, /* S-1-0 */
1175 SECURITY_WORLD_RID = 0, /* S-1-1 */
1176 SECURITY_LOCAL_RID = 0, /* S-1-2 */
1177
1178 SECURITY_CREATOR_OWNER_RID = 0, /* S-1-3 */
1179 SECURITY_CREATOR_GROUP_RID = 1, /* S-1-3 */
1180
1181 SECURITY_CREATOR_OWNER_SERVER_RID = 2, /* S-1-3 */
1182 SECURITY_CREATOR_GROUP_SERVER_RID = 3, /* S-1-3 */
1183
1184 SECURITY_DIALUP_RID = 1,
1185 SECURITY_NETWORK_RID = 2,
1186 SECURITY_BATCH_RID = 3,
1187 SECURITY_INTERACTIVE_RID = 4,
1188 SECURITY_SERVICE_RID = 6,
1189 SECURITY_ANONYMOUS_LOGON_RID = 7,
1190 SECURITY_PROXY_RID = 8,
1191 SECURITY_ENTERPRISE_CONTROLLERS_RID=9,
1192 SECURITY_SERVER_LOGON_RID = 9,
1193 SECURITY_PRINCIPAL_SELF_RID = 0xa,
1194 SECURITY_AUTHENTICATED_USER_RID = 0xb,
1195 SECURITY_RESTRICTED_CODE_RID = 0xc,
1196 SECURITY_TERMINAL_SERVER_RID = 0xd,
1197
1198 SECURITY_LOGON_IDS_RID = 5,
1199 SECURITY_LOGON_IDS_RID_COUNT = 3,
1200
1201 SECURITY_LOCAL_SYSTEM_RID = 0x12,
1202
1203 SECURITY_NT_NON_UNIQUE = 0x15,
1204
1205 SECURITY_BUILTIN_DOMAIN_RID = 0x20,
1206
1207 /*
1208 * Well-known domain relative sub-authority values (RIDs).
1209 */
1210
1211 /* Users. */
1212 DOMAIN_USER_RID_ADMIN = 0x1f4,
1213 DOMAIN_USER_RID_GUEST = 0x1f5,
1214 DOMAIN_USER_RID_KRBTGT = 0x1f6,
1215
1216 /* Groups. */
1217 DOMAIN_GROUP_RID_ADMINS = 0x200,
1218 DOMAIN_GROUP_RID_USERS = 0x201,
1219 DOMAIN_GROUP_RID_GUESTS = 0x202,
1220 DOMAIN_GROUP_RID_COMPUTERS = 0x203,
1221 DOMAIN_GROUP_RID_CONTROLLERS = 0x204,
1222 DOMAIN_GROUP_RID_CERT_ADMINS = 0x205,
1223 DOMAIN_GROUP_RID_SCHEMA_ADMINS = 0x206,
1224 DOMAIN_GROUP_RID_ENTERPRISE_ADMINS= 0x207,
1225 DOMAIN_GROUP_RID_POLICY_ADMINS = 0x208,
1226
1227 /* Aliases. */
1228 DOMAIN_ALIAS_RID_ADMINS = 0x220,
1229 DOMAIN_ALIAS_RID_USERS = 0x221,
1230 DOMAIN_ALIAS_RID_GUESTS = 0x222,
1231 DOMAIN_ALIAS_RID_POWER_USERS = 0x223,
1232
1233 DOMAIN_ALIAS_RID_ACCOUNT_OPS = 0x224,
1234 DOMAIN_ALIAS_RID_SYSTEM_OPS = 0x225,
1235 DOMAIN_ALIAS_RID_PRINT_OPS = 0x226,
1236 DOMAIN_ALIAS_RID_BACKUP_OPS = 0x227,
1237
1238 DOMAIN_ALIAS_RID_REPLICATOR = 0x228,
1239 DOMAIN_ALIAS_RID_RAS_SERVERS = 0x229,
1240 DOMAIN_ALIAS_RID_PREW2KCOMPACCESS = 0x22a,
1241} RELATIVE_IDENTIFIERS;
1242
1243/*
1244 * The universal well-known SIDs:
1245 *
1246 * NULL_SID S-1-0-0
1247 * WORLD_SID S-1-1-0
1248 * LOCAL_SID S-1-2-0
1249 * CREATOR_OWNER_SID S-1-3-0
1250 * CREATOR_GROUP_SID S-1-3-1
1251 * CREATOR_OWNER_SERVER_SID S-1-3-2
1252 * CREATOR_GROUP_SERVER_SID S-1-3-3
1253 *
1254 * (Non-unique IDs) S-1-4
1255 *
1256 * NT well-known SIDs:
1257 *
1258 * NT_AUTHORITY_SID S-1-5
1259 * DIALUP_SID S-1-5-1
1260 *
1261 * NETWORK_SID S-1-5-2
1262 * BATCH_SID S-1-5-3
1263 * INTERACTIVE_SID S-1-5-4
1264 * SERVICE_SID S-1-5-6
1265 * ANONYMOUS_LOGON_SID S-1-5-7 (aka null logon session)
1266 * PROXY_SID S-1-5-8
1267 * SERVER_LOGON_SID S-1-5-9 (aka domain controller account)
1268 * SELF_SID S-1-5-10 (self RID)
1269 * AUTHENTICATED_USER_SID S-1-5-11
1270 * RESTRICTED_CODE_SID S-1-5-12 (running restricted code)
1271 * TERMINAL_SERVER_SID S-1-5-13 (running on terminal server)
1272 *
1273 * (Logon IDs) S-1-5-5-X-Y
1274 *
1275 * (NT non-unique IDs) S-1-5-0x15-...
1276 *
1277 * (Built-in domain) S-1-5-0x20
1278 */
1279
1280/*
1281 * The SID_IDENTIFIER_AUTHORITY is a 48-bit value used in the SID structure.
1282 *
1283 * NOTE: This is stored as a big endian number, hence the high_part comes
1284 * before the low_part.
1285 */
1286typedef union {
1287 struct {
1288 u16 high_part; /* High 16-bits. */
1289 u32 low_part; /* Low 32-bits. */
1290 } __attribute__ ((__packed__)) parts;
1291 u8 value[6]; /* Value as individual bytes. */
1292} __attribute__ ((__packed__)) SID_IDENTIFIER_AUTHORITY;
1293
1294/*
1295 * The SID structure is a variable-length structure used to uniquely identify
1296 * users or groups. SID stands for security identifier.
1297 *
1298 * The standard textual representation of the SID is of the form:
1299 * S-R-I-S-S...
1300 * Where:
1301 * - The first "S" is the literal character 'S' identifying the following
1302 * digits as a SID.
1303 * - R is the revision level of the SID expressed as a sequence of digits
1304 * either in decimal or hexadecimal (if the latter, prefixed by "0x").
1305 * - I is the 48-bit identifier_authority, expressed as digits as R above.
1306 * - S... is one or more sub_authority values, expressed as digits as above.
1307 *
1308 * Example SID; the domain-relative SID of the local Administrators group on
1309 * Windows NT/2k:
1310 * S-1-5-32-544
1311 * This translates to a SID with:
1312 * revision = 1,
1313 * sub_authority_count = 2,
1314 * identifier_authority = {0,0,0,0,0,5}, // SECURITY_NT_AUTHORITY
1315 * sub_authority[0] = 32, // SECURITY_BUILTIN_DOMAIN_RID
1316 * sub_authority[1] = 544 // DOMAIN_ALIAS_RID_ADMINS
1317 */
1318typedef struct {
1319 u8 revision;
1320 u8 sub_authority_count;
1321 SID_IDENTIFIER_AUTHORITY identifier_authority;
1322 le32 sub_authority[1]; /* At least one sub_authority. */
1323} __attribute__ ((__packed__)) SID;
1324
1325/*
1326 * Current constants for SIDs.
1327 */
1328typedef enum {
1329 SID_REVISION = 1, /* Current revision level. */
1330 SID_MAX_SUB_AUTHORITIES = 15, /* Maximum number of those. */
1331 SID_RECOMMENDED_SUB_AUTHORITIES = 1, /* Will change to around 6 in
1332 a future revision. */
1333} SID_CONSTANTS;
1334
1335/*
1336 * The predefined ACE types (8-bit, see below).
1337 */
1338enum {
1339 ACCESS_MIN_MS_ACE_TYPE = 0, /* Lowest type value defined here. */
1340 ACCESS_ALLOWED_ACE_TYPE = 0,
1341 ACCESS_DENIED_ACE_TYPE = 1,
1342 SYSTEM_AUDIT_ACE_TYPE = 2,
1343 SYSTEM_ALARM_ACE_TYPE = 3, /* Not implemented as of Win2k. */
1344 ACCESS_MAX_MS_V2_ACE_TYPE = 3,
1345
1346 ACCESS_ALLOWED_COMPOUND_ACE_TYPE= 4,
1347 ACCESS_MAX_MS_V3_ACE_TYPE = 4,
1348
1349 /* The following are Win2k only. */
1350 ACCESS_MIN_MS_OBJECT_ACE_TYPE = 5,
1351 ACCESS_ALLOWED_OBJECT_ACE_TYPE = 5,
1352 ACCESS_DENIED_OBJECT_ACE_TYPE = 6,
1353 SYSTEM_AUDIT_OBJECT_ACE_TYPE = 7,
1354 SYSTEM_ALARM_OBJECT_ACE_TYPE = 8,
1355 ACCESS_MAX_MS_OBJECT_ACE_TYPE = 8,
1356
1357 ACCESS_MAX_MS_V4_ACE_TYPE = 8,
1358
1359 /* This one is for WinNT/2k. */
1360 ACCESS_MAX_MS_ACE_TYPE = 8, /* Highest type value defined here. */
1361} __attribute__ ((__packed__));
1362
1363typedef u8 ACE_TYPES; /* 8-bit storage for one of the ACE type values above. */
1364
1365/*
1366 * The ACE flags (8-bit) for audit and inheritance (see below).
1367 *
1368 * SUCCESSFUL_ACCESS_ACE_FLAG is only used with system audit and alarm ACE
1369 * types to indicate that a message is generated (in Windows!) for successful
1370 * accesses.
1371 *
1372 * FAILED_ACCESS_ACE_FLAG is only used with system audit and alarm ACE types
1373 * to indicate that a message is generated (in Windows!) for failed accesses.
1374 */
1375enum {
1376 /* The inheritance flags. */
1377 OBJECT_INHERIT_ACE = 0x01,
1378 CONTAINER_INHERIT_ACE = 0x02,
1379 NO_PROPAGATE_INHERIT_ACE = 0x04,
1380 INHERIT_ONLY_ACE = 0x08,
1381 INHERITED_ACE = 0x10, /* Win2k only. */
1382 VALID_INHERIT_FLAGS = 0x1f, /* Mask of the five inheritance flags above. */
1383
1384 /* The audit flags. */
1385 SUCCESSFUL_ACCESS_ACE_FLAG = 0x40,
1386 FAILED_ACCESS_ACE_FLAG = 0x80,
1387} __attribute__ ((__packed__));
1388
1389typedef u8 ACE_FLAGS; /* 8-bit storage for a combination of the flags above. */
1390
1391/*
1392 * An ACE is an access-control entry in an access-control list (ACL).
1393 * An ACE defines access to an object for a specific user or group or defines
1394 * the types of access that generate system-administration messages or alarms
1395 * for a specific user or group. The user or group is identified by a security
1396 * identifier (SID).
1397 *
1398 * Each ACE starts with an ACE_HEADER structure (aligned on 4-byte boundary),
1399 * which specifies the type and size of the ACE. The format of the subsequent
1400 * data depends on the ACE type.
1401 */
1402typedef struct {
1403/*Ofs*/
1404/* 0*/ ACE_TYPES type; /* Type of the ACE. */
1405/* 1*/ ACE_FLAGS flags; /* Flags describing the ACE. */
1406/* 2*/ le16 size; /* Size in bytes of the ACE. */
1407} __attribute__ ((__packed__)) ACE_HEADER; /* sizeof() = 4 bytes */
1408
1409/*
1410 * The access mask (32-bit). Defines the access rights.
1411 *
1412 * The specific rights (bits 0 to 15). These depend on the type of the object
1413 * being secured by the ACE.
1414 */
1415enum {
1416 /* Specific rights for files and directories are as follows: */
1417
1418 /* Right to read data from the file. (FILE) */
1419 FILE_READ_DATA = const_cpu_to_le32(0x00000001),
1420 /* Right to list contents of a directory. (DIRECTORY) */
1421 FILE_LIST_DIRECTORY = const_cpu_to_le32(0x00000001),
1422
1423 /* Right to write data to the file. (FILE) */
1424 FILE_WRITE_DATA = const_cpu_to_le32(0x00000002),
1425 /* Right to create a file in the directory. (DIRECTORY) */
1426 FILE_ADD_FILE = const_cpu_to_le32(0x00000002),
1427
1428 /* Right to append data to the file. (FILE) */
1429 FILE_APPEND_DATA = const_cpu_to_le32(0x00000004),
1430 /* Right to create a subdirectory. (DIRECTORY) */
1431 FILE_ADD_SUBDIRECTORY = const_cpu_to_le32(0x00000004),
1432
1433 /* Right to read extended attributes. (FILE/DIRECTORY) */
1434 FILE_READ_EA = const_cpu_to_le32(0x00000008),
1435
1436 /* Right to write extended attributes. (FILE/DIRECTORY) */
1437 FILE_WRITE_EA = const_cpu_to_le32(0x00000010),
1438
1439 /* Right to execute a file. (FILE) */
1440 FILE_EXECUTE = const_cpu_to_le32(0x00000020),
1441 /* Right to traverse the directory. (DIRECTORY) */
1442 FILE_TRAVERSE = const_cpu_to_le32(0x00000020),
1443
1444 /*
1445 * Right to delete a directory and all the files it contains (its
1446 * children), even if the files are read-only. (DIRECTORY)
1447 */
1448 FILE_DELETE_CHILD = const_cpu_to_le32(0x00000040),
1449
1450 /* Right to read file attributes. (FILE/DIRECTORY) */
1451 FILE_READ_ATTRIBUTES = const_cpu_to_le32(0x00000080),
1452
1453 /* Right to change file attributes. (FILE/DIRECTORY) */
1454 FILE_WRITE_ATTRIBUTES = const_cpu_to_le32(0x00000100),
1455
1456 /*
1457 * The standard rights (bits 16 to 23). These are independent of the
1458 * type of object being secured.
1459 */
1460
1461 /* Right to delete the object. */
1462 DELETE = const_cpu_to_le32(0x00010000),
1463
1464 /*
1465 * Right to read the information in the object's security descriptor,
1466 * not including the information in the SACL, i.e. right to read the
1467 * security descriptor and owner.
1468 */
1469 READ_CONTROL = const_cpu_to_le32(0x00020000),
1470
1471 /* Right to modify the DACL in the object's security descriptor. */
1472 WRITE_DAC = const_cpu_to_le32(0x00040000),
1473
1474 /* Right to change the owner in the object's security descriptor. */
1475 WRITE_OWNER = const_cpu_to_le32(0x00080000),
1476
1477 /*
1478 * Right to use the object for synchronization. Enables a process to
1479 * wait until the object is in the signalled state. Some object types
1480 * do not support this access right.
1481 */
1482 SYNCHRONIZE = const_cpu_to_le32(0x00100000),
1483
1484 /*
1485 * The following STANDARD_RIGHTS_* are combinations of the above for
1486 * convenience and are defined by the Win32 API.
1487 */
1488
1489 /* These are currently defined to READ_CONTROL. */
1490 STANDARD_RIGHTS_READ = const_cpu_to_le32(0x00020000),
1491 STANDARD_RIGHTS_WRITE = const_cpu_to_le32(0x00020000),
1492 STANDARD_RIGHTS_EXECUTE = const_cpu_to_le32(0x00020000),
1493
1494 /* Combines DELETE, READ_CONTROL, WRITE_DAC, and WRITE_OWNER access. */
1495 STANDARD_RIGHTS_REQUIRED = const_cpu_to_le32(0x000f0000),
1496
1497 /*
1498 * Combines DELETE, READ_CONTROL, WRITE_DAC, WRITE_OWNER, and
1499 * SYNCHRONIZE access.
1500 */
1501 STANDARD_RIGHTS_ALL = const_cpu_to_le32(0x001f0000),
1502
1503 /*
1504 * The access system ACL and maximum allowed access types (bits 24 to
1505 * 25, bits 26 to 27 are reserved).
1506 */
1507 ACCESS_SYSTEM_SECURITY = const_cpu_to_le32(0x01000000),
1508 MAXIMUM_ALLOWED = const_cpu_to_le32(0x02000000),
1509
1510 /*
1511 * The generic rights (bits 28 to 31). These map onto the standard and
1512 * specific rights.
1513 */
1514
1515 /* Read, write, and execute access. */
1516 GENERIC_ALL = const_cpu_to_le32(0x10000000),
1517
1518 /* Execute access. */
1519 GENERIC_EXECUTE = const_cpu_to_le32(0x20000000),
1520
1521 /*
1522 * Write access. For files, this maps onto:
1523 * FILE_APPEND_DATA | FILE_WRITE_ATTRIBUTES | FILE_WRITE_DATA |
1524 * FILE_WRITE_EA | STANDARD_RIGHTS_WRITE | SYNCHRONIZE
1525 * For directories, the mapping has the same numerical value. See
1526 * above for the descriptions of the rights granted.
1527 */
1528 GENERIC_WRITE = const_cpu_to_le32(0x40000000),
1529
1530 /*
1531 * Read access. For files, this maps onto:
1532 * FILE_READ_ATTRIBUTES | FILE_READ_DATA | FILE_READ_EA |
1533 * STANDARD_RIGHTS_READ | SYNCHRONIZE
1534 * For directories, the mapping has the same numerical value. See
1535 * above for the descriptions of the rights granted.
1536 */
1537 GENERIC_READ = const_cpu_to_le32(0x80000000),
1538};
1539
1540typedef le32 ACCESS_MASK; /* Little endian 32-bit combination of the rights above. */
1541
1542/*
1543 * The generic mapping array. Used to denote the mapping of each generic
1544 * access right to a specific access mask.
1545 *
1546 * FIXME: What exactly is this and what is it for? (AIA)
1547 */
1548typedef struct {
1549 ACCESS_MASK generic_read; /* Access mask for the generic read right. */
1550 ACCESS_MASK generic_write; /* Access mask for the generic write right. */
1551 ACCESS_MASK generic_execute; /* Access mask for the generic execute right. */
1552 ACCESS_MASK generic_all; /* Access mask for the generic all right. */
1553} __attribute__ ((__packed__)) GENERIC_MAPPING;
1554
1555/*
1556 * The predefined ACE type structures are as defined below.
1557 */
1558
1559/*
1560 * ACCESS_ALLOWED_ACE, ACCESS_DENIED_ACE, SYSTEM_AUDIT_ACE, SYSTEM_ALARM_ACE
1561 */
1562typedef struct {
1563/* 0 ACE_HEADER; -- Unfolded here as gcc doesn't like unnamed structs. */
1564 ACE_TYPES type; /* Type of the ACE. */
1565 ACE_FLAGS flags; /* Flags describing the ACE. */
1566 le16 size; /* Size in bytes of the ACE. */
1567/* 4*/ ACCESS_MASK mask; /* Access mask associated with the ACE. */
1568
1569/* 8*/ SID sid; /* The SID associated with the ACE (variable length). */
1570} __attribute__ ((__packed__)) ACCESS_ALLOWED_ACE, ACCESS_DENIED_ACE,
1571 SYSTEM_AUDIT_ACE, SYSTEM_ALARM_ACE;
1572
1573/*
1574 * The object ACE flags (32-bit).
1575 */
1576enum {
1577 ACE_OBJECT_TYPE_PRESENT = const_cpu_to_le32(1),
1578 ACE_INHERITED_OBJECT_TYPE_PRESENT = const_cpu_to_le32(2),
1579};
1580
1581typedef le32 OBJECT_ACE_FLAGS; /* Type of the object_flags field of the object ACEs. */
1582
1583typedef struct { /* Common layout of the four *_OBJECT_ACE types named below. */
1584/* 0 ACE_HEADER; -- Unfolded here as gcc doesn't like unnamed structs. */
1585 ACE_TYPES type; /* Type of the ACE. */
1586 ACE_FLAGS flags; /* Flags describing the ACE. */
1587 le16 size; /* Size in bytes of the ACE. */
1588/* 4*/ ACCESS_MASK mask; /* Access mask associated with the ACE. */
1589
1590/* 8*/ OBJECT_ACE_FLAGS object_flags; /* Flags describing the object ACE. */
1591/* 12*/ GUID object_type; /* NOTE(review): presumably meaningful only if ACE_OBJECT_TYPE_PRESENT is set -- confirm. */
1592/* 28*/ GUID inherited_object_type; /* NOTE(review): presumably tied to ACE_INHERITED_OBJECT_TYPE_PRESENT -- confirm. */
1593
1594/* 44*/ SID sid; /* The SID associated with the ACE. */
1595} __attribute__ ((__packed__)) ACCESS_ALLOWED_OBJECT_ACE,
1596 ACCESS_DENIED_OBJECT_ACE,
1597 SYSTEM_AUDIT_OBJECT_ACE,
1598 SYSTEM_ALARM_OBJECT_ACE;
1599
1600/*
1601 * ACL stands for access-control list.
1602 * An ACL starts with an ACL header structure, which specifies the size of
1603 * the ACL and the number of ACEs it contains. The ACL header is followed by
1604 * zero or more access control entries (ACEs). The ACL as well as each ACE
1605 * are aligned on 4-byte boundaries.
1606 */
1607typedef struct {
1608 u8 revision; /* Revision of this ACL. */
1609 u8 alignment1; /* Alignment padding. */
1610 le16 size; /* Allocated space in bytes for ACL. Includes this
1611 header, the ACEs and the remaining free space. */
1612 le16 ace_count; /* Number of ACEs in the ACL. */
1613 le16 alignment2; /* Alignment padding. */
1614/* sizeof() = 8 bytes */
1615} __attribute__ ((__packed__)) ACL;
1616
1617/*
1618 * Current constants for ACLs.
1619 */
1620typedef enum {
1621 /* Current revision. */
1622 ACL_REVISION = 2,
1623 ACL_REVISION_DS = 4,
1624
1625 /* History of revisions. */
1626 ACL_REVISION1 = 1,
1627 MIN_ACL_REVISION = 2, /* Same value as ACL_REVISION2. */
1628 ACL_REVISION2 = 2,
1629 ACL_REVISION3 = 3,
1630 ACL_REVISION4 = 4,
1631 MAX_ACL_REVISION = 4, /* Same value as ACL_REVISION4. */
1632} ACL_CONSTANTS;
1633
1634/*
1635 * The security descriptor control flags (16-bit).
1636 *
1637 * SE_OWNER_DEFAULTED - This boolean flag, when set, indicates that the SID
1638 * pointed to by the Owner field was provided by a defaulting mechanism
1639 * rather than explicitly provided by the original provider of the
1640 * security descriptor. This may affect the treatment of the SID with
1641 * respect to inheritance of an owner.
1642 *
1643 * SE_GROUP_DEFAULTED - This boolean flag, when set, indicates that the SID in
1644 * the Group field was provided by a defaulting mechanism rather than
1645 * explicitly provided by the original provider of the security
1646 * descriptor. This may affect the treatment of the SID with respect to
1647 * inheritance of a primary group.
1648 *
1649 * SE_DACL_PRESENT - This boolean flag, when set, indicates that the security
1650 * descriptor contains a discretionary ACL. If this flag is set and the
1651 * Dacl field of the SECURITY_DESCRIPTOR is null, then a null ACL is
1652 * explicitly being specified.
1653 *
1654 * SE_DACL_DEFAULTED - This boolean flag, when set, indicates that the ACL
1655 * pointed to by the Dacl field was provided by a defaulting mechanism
1656 * rather than explicitly provided by the original provider of the
1657 * security descriptor. This may affect the treatment of the ACL with
1658 * respect to inheritance of an ACL. This flag is ignored if the
1659 * DaclPresent flag is not set.
1660 *
1661 * SE_SACL_PRESENT - This boolean flag, when set, indicates that the security
1662 * descriptor contains a system ACL pointed to by the Sacl field. If this
1663 * flag is set and the Sacl field of the SECURITY_DESCRIPTOR is null, then
1664 * an empty (but present) ACL is being specified.
1665 *
1666 * SE_SACL_DEFAULTED - This boolean flag, when set, indicates that the ACL
1667 * pointed to by the Sacl field was provided by a defaulting mechanism
1668 * rather than explicitly provided by the original provider of the
1669 * security descriptor. This may affect the treatment of the ACL with
1670 * respect to inheritance of an ACL. This flag is ignored if the
1671 * SaclPresent flag is not set.
1672 *
1673 * SE_SELF_RELATIVE - This boolean flag, when set, indicates that the security
1674 * descriptor is in self-relative form. In this form, all fields of the
1675 * security descriptor are contiguous in memory and all pointer fields are
1676 * expressed as offsets from the beginning of the security descriptor.
1677 */
1678enum {
1679 SE_OWNER_DEFAULTED = const_cpu_to_le16(0x0001),
1680 SE_GROUP_DEFAULTED = const_cpu_to_le16(0x0002),
1681 SE_DACL_PRESENT = const_cpu_to_le16(0x0004),
1682 SE_DACL_DEFAULTED = const_cpu_to_le16(0x0008),
1683
1684 SE_SACL_PRESENT = const_cpu_to_le16(0x0010),
1685 SE_SACL_DEFAULTED = const_cpu_to_le16(0x0020),
1686
1687 SE_DACL_AUTO_INHERIT_REQ = const_cpu_to_le16(0x0100),
1688 SE_SACL_AUTO_INHERIT_REQ = const_cpu_to_le16(0x0200),
1689 SE_DACL_AUTO_INHERITED = const_cpu_to_le16(0x0400),
1690 SE_SACL_AUTO_INHERITED = const_cpu_to_le16(0x0800),
1691
1692 SE_DACL_PROTECTED = const_cpu_to_le16(0x1000),
1693 SE_SACL_PROTECTED = const_cpu_to_le16(0x2000),
1694 SE_RM_CONTROL_VALID = const_cpu_to_le16(0x4000),
1695 SE_SELF_RELATIVE = const_cpu_to_le16(0x8000)
1696} __attribute__ ((__packed__));
1697
1698typedef le16 SECURITY_DESCRIPTOR_CONTROL; /* Type of the control field of the security descriptors. */
1699
1700/*
1701 * Self-relative security descriptor. Contains the owner and group SIDs as well
1702 * as the sacl and dacl ACLs inside the security descriptor itself.
1703 */
1704typedef struct {
1705 u8 revision; /* Revision level of the security descriptor. */
1706 u8 alignment;
1707 SECURITY_DESCRIPTOR_CONTROL control; /* Flags qualifying the type of
1708 the descriptor as well as the following fields. */
1709 le32 owner; /* Byte offset to a SID representing an object's
1710 owner. If this is zero, no owner SID is present in
1711 the descriptor. */
1712 le32 group; /* Byte offset to a SID representing an object's
1713 primary group. If this is zero, no primary group
1714 SID is present in the descriptor. */
1715 le32 sacl; /* Byte offset to a system ACL. Only valid, if
1716 SE_SACL_PRESENT is set in the control field. If
1717 SE_SACL_PRESENT is set but sacl is zero, a NULL ACL
1718 is specified. */
1719 le32 dacl; /* Byte offset to a discretionary ACL. Only valid, if
1720 SE_DACL_PRESENT is set in the control field. If
1721 SE_DACL_PRESENT is set but dacl is zero, a NULL ACL
1722 (unconditionally granting access) is specified. */
1723/* sizeof() = 0x14 bytes */
1724} __attribute__ ((__packed__)) SECURITY_DESCRIPTOR_RELATIVE;
1725
1726/*
1727 * Absolute security descriptor. Does not contain the owner and group SIDs, nor
1728 * the sacl and dacl ACLs inside the security descriptor. Instead, it contains
1729 * pointers to these structures in memory. Obviously, absolute security
1730 * descriptors are only useful for in memory representations of security
1731 * descriptors. On disk, a self-relative security descriptor is used.
1732 */
1733typedef struct {
1734 u8 revision; /* Revision level of the security descriptor. */
1735 u8 alignment;
1736 SECURITY_DESCRIPTOR_CONTROL control; /* Flags qualifying the type of
1737 the descriptor as well as the following fields. */
1738 SID *owner; /* Points to a SID representing an object's owner. If
1739 this is NULL, no owner SID is present in the
1740 descriptor. */
1741 SID *group; /* Points to a SID representing an object's primary
1742 group. If this is NULL, no primary group SID is
1743 present in the descriptor. */
1744 ACL *sacl; /* Points to a system ACL. Only valid, if
1745 SE_SACL_PRESENT is set in the control field. If
1746 SE_SACL_PRESENT is set but sacl is NULL, a NULL ACL
1747 is specified. */
1748 ACL *dacl; /* Points to a discretionary ACL. Only valid, if
1749 SE_DACL_PRESENT is set in the control field. If
1750 SE_DACL_PRESENT is set but dacl is NULL, a NULL ACL
1751 (unconditionally granting access) is specified. */
1752} __attribute__ ((__packed__)) SECURITY_DESCRIPTOR; /* sizeof() = 4 bytes + four pointers, so it depends on the pointer size. */
1753
1754/*
1755 * Current constants for security descriptors.
1756 */
1757typedef enum {
1758 /* Current revision. */
1759 SECURITY_DESCRIPTOR_REVISION = 1,
1760 SECURITY_DESCRIPTOR_REVISION1 = 1,
1761
1762 /* Size of the absolute security descriptor. It equals the size of the
1763 relative one only where pointers are 32-bit, e.g. on ia32. */
1764 SECURITY_DESCRIPTOR_MIN_LENGTH = sizeof(SECURITY_DESCRIPTOR),
1765} SECURITY_DESCRIPTOR_CONSTANTS;
1766
1767/*
1768 * Attribute: Security descriptor (0x50). A standard self-relative security
1769 * descriptor.
1770 *
1771 * NOTE: Can be resident or non-resident.
1772 * NOTE: Not used in NTFS 3.0+, as security descriptors are stored centrally
1773 * in FILE_Secure and the correct descriptor is found using the security_id
1774 * from the standard information attribute.
1775 */
1776typedef SECURITY_DESCRIPTOR_RELATIVE SECURITY_DESCRIPTOR_ATTR; /* Same layout as SECURITY_DESCRIPTOR_RELATIVE. */
1777
1778/*
1779 * On NTFS 3.0+, all security descriptors are stored in FILE_Secure. Only one
1780 * referenced instance of each unique security descriptor is stored.
1781 *
1782 * FILE_Secure contains no unnamed data attribute, i.e. it has zero length. It
1783 * does, however, contain two indexes ($SDH and $SII) as well as a named data
1784 * stream ($SDS).
1785 *
1786 * Every unique security descriptor is assigned a unique security identifier
1787 * (security_id, not to be confused with a SID). The security_id is unique for
1788 * the NTFS volume and is used as an index into the $SII index, which maps
1789 * security_ids to the security descriptor's storage location within the $SDS
1790 * data attribute. The $SII index is sorted by ascending security_id.
1791 *
1792 * A simple hash is computed from each security descriptor. This hash is used
1793 * as an index into the $SDH index, which maps security descriptor hashes to
1794 * the security descriptor's storage location within the $SDS data attribute.
1795 * The $SDH index is sorted by security descriptor hash and is stored in a B+
1796 * tree. When searching $SDH (with the intent of determining whether or not a
1797 * new security descriptor is already present in the $SDS data stream), if a
1798 * matching hash is found, but the security descriptors do not match, the
1799 * search in the $SDH index is continued, searching for a next matching hash.
1800 *
1801 * When a precise match is found, the security_id corresponding to the security
1802 * descriptor in the $SDS attribute is read from the found $SDH index entry and
1803 * is stored in the $STANDARD_INFORMATION attribute of the file/directory to
1804 * which the security descriptor is being applied. The $STANDARD_INFORMATION
1805 * attribute is present in all base mft records (i.e. in all files and
1806 * directories).
1807 *
1808 * If a match is not found, the security descriptor is assigned a new unique
1809 * security_id and is added to the $SDS data attribute. Then, entries
1810 * referencing this security descriptor in the $SDS data attribute are
1811 * added to the $SDH and $SII indexes.
1812 *
1813 * Note: Entries are never deleted from FILE_Secure, even if nothing
1814 * references an entry any more.
1815 */
1816
1817/*
1818 * This header precedes each security descriptor in the $SDS data stream.
1819 * This is also the index entry data part of both the $SII and $SDH indexes.
1820 */
1821typedef struct {
1822 le32 hash; /* Hash of the security descriptor. */
1823 le32 security_id; /* The security_id assigned to the descriptor. */
1824 le64 offset; /* Byte offset of this entry in the $SDS stream. */
1825 le32 length; /* Size in bytes of this entry in $SDS stream. */
1826} __attribute__ ((__packed__)) SECURITY_DESCRIPTOR_HEADER; /* sizeof() = 20 bytes */
1827
1828/*
1829 * The $SDS data stream contains the security descriptors, aligned on 16-byte
1830 * boundaries, sorted by security_id in a B+ tree. Security descriptors cannot
1831 * cross 256kib boundaries (this restriction is imposed by the Windows cache
1832 * manager). Each security descriptor is contained in a SDS_ENTRY structure.
1833 * Also, each security descriptor is stored twice in the $SDS stream with a
1834 * fixed offset of 0x40000 bytes (256kib, the Windows cache manager's max size)
1835 * between them; i.e. if a SDS_ENTRY specifies an offset of 0x51d0, then the
1836 * first copy of the security descriptor will be at offset 0x51d0 in the
1837 * $SDS data stream and the second copy will be at offset 0x451d0.
1838 */
1839typedef struct {
1840/*Ofs*/
1841/* 0 SECURITY_DESCRIPTOR_HEADER; -- Unfolded here as gcc doesn't like
1842 unnamed structs. */
1843 le32 hash; /* Hash of the security descriptor. */
1844 le32 security_id; /* The security_id assigned to the descriptor. */
1845 le64 offset; /* Byte offset of this entry in the $SDS stream. */
1846 le32 length; /* Size in bytes of this entry in $SDS stream. */
1847/* 20*/ SECURITY_DESCRIPTOR_RELATIVE sid; /* The self-relative security
1848 descriptor (not a SID, despite the field name). */
1849} __attribute__ ((__packed__)) SDS_ENTRY;
1850
1851/*
1852 * The index entry key used in the $SII index. The collation type is
1853 * COLLATION_NTOFS_ULONG.
1854 */
1855typedef struct {
1856 le32 security_id; /* The security_id assigned to the descriptor. */
1857} __attribute__ ((__packed__)) SII_INDEX_KEY; /* sizeof() = 4 bytes */
1858
1859/*
1860 * The index entry key used in the $SDH index. The keys are sorted first by
1861 * hash and then by security_id. The collation rule is
1862 * COLLATION_NTOFS_SECURITY_HASH.
1863 */
1864typedef struct {
1865 le32 hash; /* Hash of the security descriptor. */
1866 le32 security_id; /* The security_id assigned to the descriptor. */
1867} __attribute__ ((__packed__)) SDH_INDEX_KEY; /* sizeof() = 8 bytes */
1868
1869/*
1870 * Attribute: Volume name (0x60).
1871 *
1872 * NOTE: Always resident.
1873 * NOTE: Present only in FILE_Volume.
1874 */
1875typedef struct {
1876 ntfschar name[0]; /* The name of the volume in Unicode. Zero-length
 array: no length or terminator is declared here. */
1877} __attribute__ ((__packed__)) VOLUME_NAME;
1878
1879/*
1880 * Possible flags for the volume (16-bit).
1881 */
1882enum {
1883 VOLUME_IS_DIRTY = const_cpu_to_le16(0x0001),
1884 VOLUME_RESIZE_LOG_FILE = const_cpu_to_le16(0x0002),
1885 VOLUME_UPGRADE_ON_MOUNT = const_cpu_to_le16(0x0004),
1886 VOLUME_MOUNTED_ON_NT4 = const_cpu_to_le16(0x0008),
1887
1888 VOLUME_DELETE_USN_UNDERWAY = const_cpu_to_le16(0x0010),
1889 VOLUME_REPAIR_OBJECT_ID = const_cpu_to_le16(0x0020),
1890
1891 VOLUME_MODIFIED_BY_CHKDSK = const_cpu_to_le16(0x8000),
1892
1893 VOLUME_FLAGS_MASK = const_cpu_to_le16(0x803f), /* All of the flags above. */
1894
1895 /* To make our life easier when checking if we must mount read-only. */
1896 VOLUME_MUST_MOUNT_RO_MASK = const_cpu_to_le16(0x8037), /* All of the flags above except VOLUME_MOUNTED_ON_NT4. */
1897} __attribute__ ((__packed__));
1898
1899typedef le16 VOLUME_FLAGS;
1900
1901/*
1902 * Attribute: Volume information (0x70).
1903 *
1904 * NOTE: Always resident.
1905 * NOTE: Present only in FILE_Volume.
1906 * NOTE: Windows 2000 uses NTFS 3.0 while Windows NT4 service pack 6a uses
1907 * NTFS 1.2. I haven't personally seen other values yet.
1908 */
1909typedef struct {
1910 le64 reserved; /* Not used (yet?). */
1911 u8 major_ver; /* Major version of the ntfs format. */
1912 u8 minor_ver; /* Minor version of the ntfs format. */
1913 VOLUME_FLAGS flags; /* Bit array of VOLUME_* flags. */
1914} __attribute__ ((__packed__)) VOLUME_INFORMATION; /* sizeof() = 12 bytes */
1915
1916/*
1917 * Attribute: Data attribute (0x80).
1918 *
1919 * NOTE: Can be resident or non-resident.
1920 *
1921 * Data contents of a file (i.e. the unnamed stream) or of a named stream.
1922 */
1923typedef struct {
1924 u8 data[0]; /* The file's data contents. Zero-length array: the size
 is not declared here. */
1925} __attribute__ ((__packed__)) DATA_ATTR;
1926
1927/*
1928 * Index header flags (8-bit).
1929 */
1930enum {
1931 /*
1932 * When index header is in an index root attribute:
1933 */
1934 SMALL_INDEX = 0, /* The index is small enough to fit inside the index
1935 root attribute and there is no index allocation
1936 attribute present. */
1937 LARGE_INDEX = 1, /* The index is too large to fit in the index root
1938 attribute and/or an index allocation attribute is
1939 present. */
1940 /*
1941 * When index header is in an index block, i.e. is part of index
1942 * allocation attribute:
1943 */
1944 LEAF_NODE = 0, /* This is a leaf node, i.e. there are no more nodes
1945 branching off it. */
1946 INDEX_NODE = 1, /* This node indexes other nodes, i.e. it is not a leaf
1947 node. */
1948 NODE_MASK = 1, /* Mask for accessing the *_NODE bits. */
1949} __attribute__ ((__packed__));
1950
1951typedef u8 INDEX_HEADER_FLAGS; /* Type of the flags field in INDEX_HEADER. */
1952
1953/*
1954 * This is the header for indexes, describing the INDEX_ENTRY records, which
1955 * follow the INDEX_HEADER. Together the index header and the index entries
1956 * make up a complete index.
1957 *
1958 * IMPORTANT NOTE: The offset, length and size structure members are counted
1959 * relative to the start of the index header structure and not relative to the
1960 * start of the index root or index allocation structures themselves.
1961 */
1962typedef struct {
1963 le32 entries_offset; /* Byte offset to first INDEX_ENTRY
1964 aligned to 8-byte boundary. */
1965 le32 index_length; /* Data size of the index in bytes,
1966 i.e. bytes used from allocated
1967 size, aligned to 8-byte boundary. */
1968 le32 allocated_size; /* Byte size of this index (block),
1969 multiple of 8 bytes. */
1970 /* NOTE: For the index root attribute, the above two numbers are always
1971 equal, as the attribute is resident and it is resized as needed. In
1972 the case of the index allocation attribute the attribute is not
1973 resident and hence the allocated_size is a fixed value and must
1974 equal the index_block_size specified by the INDEX_ROOT attribute
1975 corresponding to the INDEX_ALLOCATION attribute this INDEX_BLOCK
1976 belongs to. */
1977 INDEX_HEADER_FLAGS flags; /* Bit field of INDEX_HEADER_FLAGS. */
1978 u8 reserved[3]; /* Reserved/align to 8-byte boundary. */
1979} __attribute__ ((__packed__)) INDEX_HEADER; /* sizeof() = 16 bytes */
1980
1981/*
1982 * Attribute: Index root (0x90).
1983 *
1984 * NOTE: Always resident.
1985 *
1986 * This is followed by a sequence of index entries (INDEX_ENTRY structures)
1987 * as described by the index header.
1988 *
1989 * When a directory is small enough to fit inside the index root then this
1990 * is the only attribute describing the directory. When the directory is too
1991 * large to fit in the index root, on the other hand, two additional attributes
1992 * are present: an index allocation attribute, containing sub-nodes of the B+
1993 * directory tree (see below), and a bitmap attribute, describing which virtual
1994 * cluster numbers (vcns) in the index allocation attribute are in use by an
1995 * index block.
1996 *
1997 * NOTE: The root directory (FILE_root) contains an entry for itself. Other
1998 * directories do not contain entries for themselves, though.
1999 */
2000typedef struct {
2001 ATTR_TYPE type; /* Type of the indexed attribute. Is
2002 $FILE_NAME for directories, zero
2003 for view indexes. No other values
2004 allowed. */
2005 COLLATION_RULE collation_rule; /* Collation rule used to sort the
2006 index entries. If type is $FILE_NAME,
2007 this must be COLLATION_FILE_NAME. */
2008 le32 index_block_size; /* Size of each index block in bytes (in
2009 the index allocation attribute). */
2010 u8 clusters_per_index_block; /* Cluster size of each index block (in
2011 the index allocation attribute), when
2012 an index block is >= a cluster,
2013 otherwise this will be the log of
2014 the size (like how the encoding of
2015 the mft record size and the index
2016 record size found in the boot sector
2017 work). Has to be a power of 2. */
2018 u8 reserved[3]; /* Reserved/align to 8-byte boundary. */
2019 INDEX_HEADER index; /* Index header describing the
2020 following index entries. */
2021} __attribute__ ((__packed__)) INDEX_ROOT;
2022
2023/*
2024 * Attribute: Index allocation (0xa0).
2025 *
2026 * NOTE: Always non-resident (doesn't make sense to be resident anyway!).
2027 *
2028 * This is an array of index blocks. Each index block starts with an
2029 * INDEX_BLOCK structure containing an index header, followed by a sequence of
2030 * index entries (INDEX_ENTRY structures), as described by the INDEX_HEADER.
2031 */
2032typedef struct {
2033/* 0 NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */
2034 NTFS_RECORD_TYPE magic; /* Magic is "INDX". */
2035 le16 usa_ofs; /* See NTFS_RECORD definition. */
2036 le16 usa_count; /* See NTFS_RECORD definition. */
2037
2038/* 8*/ sle64 lsn; /* $LogFile sequence number of the last
2039 modification of this index block. */
2040/* 16*/ leVCN index_block_vcn; /* Virtual cluster number of the index block.
2041 If the cluster_size on the volume is <= the
2042 index_block_size of the directory,
2043 index_block_vcn counts in units of clusters,
2044 and in units of sectors otherwise. */
2045/* 24*/ INDEX_HEADER index; /* Describes the following index entries. */
2046/* sizeof()= 40 (0x28) bytes */
2047/*
2048 * When creating the index block, we place the update sequence array at this
2049 * offset, i.e. before we start with the index entries. This also makes sense,
2050 * otherwise we could run into problems with the update sequence array
2051 * containing in itself the last two bytes of a sector which would mean that
2052 * multi sector transfer protection wouldn't work. As you can't protect data
2053 * by overwriting it since you then can't get it back...
2054 * When reading use the data from the ntfs record header.
2055 */
2056} __attribute__ ((__packed__)) INDEX_BLOCK;
2057
2058typedef INDEX_BLOCK INDEX_ALLOCATION; /* Alternative name for the INDEX_BLOCK layout. */
2059
2060/*
2061 * The system file FILE_Extend/$Reparse contains an index named $R listing
2062 * all reparse points on the volume. The index entry keys are as defined
2063 * below. Note, that there is no index data associated with the index entries.
2064 *
2065 * The index entries are sorted by the index key file_id. The collation rule is
2066 * COLLATION_NTOFS_ULONGS. FIXME: Verify whether the reparse_tag is not the
2067 * primary key / is not a key at all. (AIA)
2068 */
2069typedef struct {
2070 le32 reparse_tag; /* Reparse point type (including flags). */
2071 leMFT_REF file_id; /* Mft record of the file containing the
2072 reparse point attribute. */
2073} __attribute__ ((__packed__)) REPARSE_INDEX_KEY;
2074
2075/*
2076 * Quota flags (32-bit).
2077 *
2078 * The user quota flags. Names explain meaning.
2079 */
2080enum {
2081 QUOTA_FLAG_DEFAULT_LIMITS = const_cpu_to_le32(0x00000001),
2082 QUOTA_FLAG_LIMIT_REACHED = const_cpu_to_le32(0x00000002),
2083 QUOTA_FLAG_ID_DELETED = const_cpu_to_le32(0x00000004),
2084
2085 QUOTA_FLAG_USER_MASK = const_cpu_to_le32(0x00000007),
2086 /* This is a bit mask for the three user quota flags above. */
2087
2088 /*
2089 * These flags are only present in the quota defaults index entry, i.e.
2090 * in the entry where owner_id = QUOTA_DEFAULTS_ID.
2091 */
2092 QUOTA_FLAG_TRACKING_ENABLED = const_cpu_to_le32(0x00000010),
2093 QUOTA_FLAG_ENFORCEMENT_ENABLED = const_cpu_to_le32(0x00000020),
2094 QUOTA_FLAG_TRACKING_REQUESTED = const_cpu_to_le32(0x00000040),
2095 QUOTA_FLAG_LOG_THRESHOLD = const_cpu_to_le32(0x00000080),
2096
2097 QUOTA_FLAG_LOG_LIMIT = const_cpu_to_le32(0x00000100),
2098 QUOTA_FLAG_OUT_OF_DATE = const_cpu_to_le32(0x00000200),
2099 QUOTA_FLAG_CORRUPT = const_cpu_to_le32(0x00000400),
2100 QUOTA_FLAG_PENDING_DELETES = const_cpu_to_le32(0x00000800),
2101};
2102
2103typedef le32 QUOTA_FLAGS; /* Type of the flags field in QUOTA_CONTROL_ENTRY. */
2104
2105/*
2106 * The system file FILE_Extend/$Quota contains two indexes $O and $Q. Quotas
2107 * are on a per volume and per user basis.
2108 *
2109 * The $Q index contains one entry for each existing user_id on the volume. The
2110 * index key is the user_id of the user/group owning this quota control entry,
2111 * i.e. the key is the owner_id. The user_id of the owner of a file, i.e. the
2112 * owner_id, is found in the standard information attribute. The collation rule
2113 * for $Q is COLLATION_NTOFS_ULONG.
2114 *
2115 * The $O index contains one entry for each user/group who has been assigned
2116 * a quota on that volume. The index key holds the SID of the user_id the
2117 * entry belongs to, i.e. the owner_id. The collation rule for $O is
2118 * COLLATION_NTOFS_SID.
2119 *
2120 * The $O index entry data is the user_id of the user corresponding to the SID.
2121 * This user_id is used as an index into $Q to find the quota control entry
2122 * associated with the SID.
2123 *
2124 * The $Q index entry data is the quota control entry and is defined below.
2125 */
2126typedef struct {
2127 le32 version; /* Currently equals 2. */
2128 QUOTA_FLAGS flags; /* Flags describing this quota entry. */
2129 le64 bytes_used; /* How many bytes of the quota are in use. */
2130 sle64 change_time; /* Last time this quota entry was changed. */
2131 sle64 threshold; /* Soft quota (-1 if not limited). */
2132 sle64 limit; /* Hard quota (-1 if not limited). */
2133 sle64 exceeded_time; /* How long the soft quota has been exceeded. */
2134 SID sid; /* The SID of the user/object associated with
2135 this quota entry. Equals zero for the quota
2136 defaults entry (and in fact on a WinXP
2137 volume, it is not present at all). */
2138} __attribute__ ((__packed__)) QUOTA_CONTROL_ENTRY;
2139
2140/*
2141 * Predefined owner_id values (32-bit).
2142 */
2143enum {
2144 QUOTA_INVALID_ID = const_cpu_to_le32(0x00000000),
2145 QUOTA_DEFAULTS_ID = const_cpu_to_le32(0x00000001),
2146 QUOTA_FIRST_USER_ID = const_cpu_to_le32(0x00000100),
2147};
2148
2149/*
2150 * Current constants for quota control entries.
2151 */
typedef enum {
	QUOTA_VERSION = 2,	/* Current quota control entry version. */
} QUOTA_CONTROL_ENTRY_CONSTANTS;
2156
2157/*
2158 * Index entry flags (16-bit).
2159 */
2160enum {
2161 INDEX_ENTRY_NODE = const_cpu_to_le16(1), /* This entry contains a
2162 sub-node, i.e. a reference to an index block in form of
2163 a virtual cluster number (see below). */
2164 INDEX_ENTRY_END = const_cpu_to_le16(2), /* This signifies the last
2165 entry in an index block. The index entry does not
2166 represent a file but it can point to a sub-node. */
2167
2168 INDEX_ENTRY_SPACE_FILLER = const_cpu_to_le16(0xffff), /* gcc: Force
2169 enum bit width to 16-bit. */
2170} __attribute__ ((__packed__));
2171
2172typedef le16 INDEX_ENTRY_FLAGS;
2173
2174/*
2175 * This the index entry header (see below).
2176 */
2177typedef struct {
2178/* 0*/ union {
2179 struct { /* Only valid when INDEX_ENTRY_END is not set. */
2180 leMFT_REF indexed_file; /* The mft reference of the file
2181 described by this index
2182 entry. Used for directory
2183 indexes. */
2184 } __attribute__ ((__packed__)) dir;
2185 struct { /* Used for views/indexes to find the entry's data. */
2186 le16 data_offset; /* Data byte offset from this
2187 INDEX_ENTRY. Follows the
2188 index key. */
2189 le16 data_length; /* Data length in bytes. */
2190 le32 reservedV; /* Reserved (zero). */
2191 } __attribute__ ((__packed__)) vi;
2192 } __attribute__ ((__packed__)) data;
2193/* 8*/ le16 length; /* Byte size of this index entry, multiple of
2194 8-bytes. */
2195/* 10*/ le16 key_length; /* Byte size of the key value, which is in the
2196 index entry. It follows field reserved. Not
2197 multiple of 8-bytes. */
2198/* 12*/ INDEX_ENTRY_FLAGS flags; /* Bit field of INDEX_ENTRY_* flags. */
2199/* 14*/ le16 reserved; /* Reserved/align to 8-byte boundary. */
2200/* sizeof() = 16 bytes */
2201} __attribute__ ((__packed__)) INDEX_ENTRY_HEADER;
2202
2203/*
2204 * This is an index entry. A sequence of such entries follows each INDEX_HEADER
2205 * structure. Together they make up a complete index. The index follows either
2206 * an index root attribute or an index allocation attribute.
2207 *
2208 * NOTE: Before NTFS 3.0 only filename attributes were indexed.
2209 */
2210typedef struct {
2211/*Ofs*/
2212/* 0 INDEX_ENTRY_HEADER; -- Unfolded here as gcc dislikes unnamed structs. */
2213 union {
2214 struct { /* Only valid when INDEX_ENTRY_END is not set. */
2215 leMFT_REF indexed_file; /* The mft reference of the file
2216 described by this index
2217 entry. Used for directory
2218 indexes. */
2219 } __attribute__ ((__packed__)) dir;
2220 struct { /* Used for views/indexes to find the entry's data. */
2221 le16 data_offset; /* Data byte offset from this
2222 INDEX_ENTRY. Follows the
2223 index key. */
2224 le16 data_length; /* Data length in bytes. */
2225 le32 reservedV; /* Reserved (zero). */
2226 } __attribute__ ((__packed__)) vi;
2227 } __attribute__ ((__packed__)) data;
2228 le16 length; /* Byte size of this index entry, multiple of
2229 8-bytes. */
2230 le16 key_length; /* Byte size of the key value, which is in the
2231 index entry. It follows field reserved. Not
2232 multiple of 8-bytes. */
2233 INDEX_ENTRY_FLAGS flags; /* Bit field of INDEX_ENTRY_* flags. */
2234 le16 reserved; /* Reserved/align to 8-byte boundary. */
2235
2236/* 16*/ union { /* The key of the indexed attribute. NOTE: Only present
2237 if INDEX_ENTRY_END bit in flags is not set. NOTE: On
2238 NTFS versions before 3.0 the only valid key is the
2239 FILE_NAME_ATTR. On NTFS 3.0+ the following
2240 additional index keys are defined: */
2241 FILE_NAME_ATTR file_name;/* $I30 index in directories. */
2242 SII_INDEX_KEY sii; /* $SII index in $Secure. */
2243 SDH_INDEX_KEY sdh; /* $SDH index in $Secure. */
2244 GUID object_id; /* $O index in FILE_Extend/$ObjId: The
2245 object_id of the mft record found in
2246 the data part of the index. */
2247 REPARSE_INDEX_KEY reparse; /* $R index in
2248 FILE_Extend/$Reparse. */
2249 SID sid; /* $O index in FILE_Extend/$Quota:
2250 SID of the owner of the user_id. */
2251 le32 owner_id; /* $Q index in FILE_Extend/$Quota:
2252 user_id of the owner of the quota
2253 control entry in the data part of
2254 the index. */
2255 } __attribute__ ((__packed__)) key;
2256 /* The (optional) index data is inserted here when creating. */
2257 // leVCN vcn; /* If INDEX_ENTRY_NODE bit in flags is set, the last
2258 // eight bytes of this index entry contain the virtual
2259 // cluster number of the index block that holds the
2260 // entries immediately preceding the current entry (the
2261 // vcn references the corresponding cluster in the data
2262 // of the non-resident index allocation attribute). If
2263 // the key_length is zero, then the vcn immediately
2264 // follows the INDEX_ENTRY_HEADER. Regardless of
2265 // key_length, the address of the 8-byte boundary
2266 // alligned vcn of INDEX_ENTRY{_HEADER} *ie is given by
2267 // (char*)ie + le16_to_cpu(ie*)->length) - sizeof(VCN),
2268 // where sizeof(VCN) can be hardcoded as 8 if wanted. */
2269} __attribute__ ((__packed__)) INDEX_ENTRY;
2270
2271/*
2272 * Attribute: Bitmap (0xb0).
2273 *
2274 * Contains an array of bits (aka a bitfield).
2275 *
2276 * When used in conjunction with the index allocation attribute, each bit
2277 * corresponds to one index block within the index allocation attribute. Thus
2278 * the number of bits in the bitmap * index block size / cluster size is the
2279 * number of clusters in the index allocation attribute.
2280 */
2281typedef struct {
2282 u8 bitmap[0]; /* Array of bits. */
2283} __attribute__ ((__packed__)) BITMAP_ATTR;
2284
2285/*
2286 * The reparse point tag defines the type of the reparse point. It also
2287 * includes several flags, which further describe the reparse point.
2288 *
2289 * The reparse point tag is an unsigned 32-bit value divided in three parts:
2290 *
2291 * 1. The least significant 16 bits (i.e. bits 0 to 15) specify the type of
2292 * the reparse point.
2293 * 2. The 13 bits after this (i.e. bits 16 to 28) are reserved for future use.
2294 * 3. The most significant three bits are flags describing the reparse point.
2295 * They are defined as follows:
2296 * bit 29: Name surrogate bit. If set, the filename is an alias for
2297 * another object in the system.
2298 * bit 30: High-latency bit. If set, accessing the first byte of data will
2299 * be slow. (E.g. the data is stored on a tape drive.)
2300 * bit 31: Microsoft bit. If set, the tag is owned by Microsoft. User
2301 * defined tags have to use zero here.
2302 *
2303 * These are the predefined reparse point tags:
2304 */
2305enum {
2306 IO_REPARSE_TAG_IS_ALIAS = const_cpu_to_le32(0x20000000),
2307 IO_REPARSE_TAG_IS_HIGH_LATENCY = const_cpu_to_le32(0x40000000),
2308 IO_REPARSE_TAG_IS_MICROSOFT = const_cpu_to_le32(0x80000000),
2309
2310 IO_REPARSE_TAG_RESERVED_ZERO = const_cpu_to_le32(0x00000000),
2311 IO_REPARSE_TAG_RESERVED_ONE = const_cpu_to_le32(0x00000001),
2312 IO_REPARSE_TAG_RESERVED_RANGE = const_cpu_to_le32(0x00000001),
2313
2314 IO_REPARSE_TAG_NSS = const_cpu_to_le32(0x68000005),
2315 IO_REPARSE_TAG_NSS_RECOVER = const_cpu_to_le32(0x68000006),
2316 IO_REPARSE_TAG_SIS = const_cpu_to_le32(0x68000007),
2317 IO_REPARSE_TAG_DFS = const_cpu_to_le32(0x68000008),
2318
2319 IO_REPARSE_TAG_MOUNT_POINT = const_cpu_to_le32(0x88000003),
2320
2321 IO_REPARSE_TAG_HSM = const_cpu_to_le32(0xa8000004),
2322
2323 IO_REPARSE_TAG_SYMBOLIC_LINK = const_cpu_to_le32(0xe8000000),
2324
2325 IO_REPARSE_TAG_VALID_VALUES = const_cpu_to_le32(0xe000ffff),
2326};
2327
2328/*
2329 * Attribute: Reparse point (0xc0).
2330 *
2331 * NOTE: Can be resident or non-resident.
2332 */
2333typedef struct {
2334 le32 reparse_tag; /* Reparse point type (inc. flags). */
2335 le16 reparse_data_length; /* Byte size of reparse data. */
2336 le16 reserved; /* Align to 8-byte boundary. */
2337 u8 reparse_data[0]; /* Meaning depends on reparse_tag. */
2338} __attribute__ ((__packed__)) REPARSE_POINT;
2339
2340/*
2341 * Attribute: Extended attribute (EA) information (0xd0).
2342 *
2343 * NOTE: Always resident. (Is this true???)
2344 */
2345typedef struct {
2346 le16 ea_length; /* Byte size of the packed extended
2347 attributes. */
2348 le16 need_ea_count; /* The number of extended attributes which have
2349 the NEED_EA bit set. */
2350 le32 ea_query_length; /* Byte size of the buffer required to query
2351 the extended attributes when calling
2352 ZwQueryEaFile() in Windows NT/2k. I.e. the
2353 byte size of the unpacked extended
2354 attributes. */
2355} __attribute__ ((__packed__)) EA_INFORMATION;
2356
2357/*
2358 * Extended attribute flags (8-bit).
2359 */
enum {
	NEED_EA	= 0x80,		/* The only EA flag defined here. */
} __attribute__ ((__packed__));
2363
2364typedef u8 EA_FLAGS;
2365
2366/*
2367 * Attribute: Extended attribute (EA) (0xe0).
2368 *
2369 * NOTE: Always non-resident. (Is this true?)
2370 *
2371 * Like the attribute list and the index buffer list, the EA attribute value is
2372 * a sequence of EA_ATTR variable length records.
2373 *
2374 * FIXME: It appears weird that the EA name is not unicode. Is it true?
2375 */
2376typedef struct {
2377 le32 next_entry_offset; /* Offset to the next EA_ATTR. */
2378 EA_FLAGS flags; /* Flags describing the EA. */
2379 u8 ea_name_length; /* Length of the name of the EA in bytes. */
2380 le16 ea_value_length; /* Byte size of the EA's value. */
2381 u8 ea_name[0]; /* Name of the EA. */
2382 u8 ea_value[0]; /* The value of the EA. Immediately follows
2383 the name. */
2384} __attribute__ ((__packed__)) EA_ATTR;
2385
2386/*
2387 * Attribute: Property set (0xf0).
2388 *
2389 * Intended to support Native Structure Storage (NSS) - a feature removed from
2390 * NTFS 3.0 during beta testing.
2391 */
typedef struct {
	/* Deliberately empty: irrelevant as the feature is unused. */
} __attribute__ ((__packed__)) PROPERTY_SET;
2395
2396/*
2397 * Attribute: Logged utility stream (0x100).
2398 *
2399 * NOTE: Can be resident or non-resident.
2400 *
2401 * Operations on this attribute are logged to the journal ($LogFile) like
2402 * normal metadata changes.
2403 *
2404 * Used by the Encrypting File System (EFS). All encrypted files have this
2405 * attribute with the name $EFS.
2406 */
typedef struct {
	/*
	 * Can be anything the creator chooses.
	 * EFS uses it as follows:
	 * FIXME: Type this info, verifying it along the way. (AIA)
	 */
} __attribute__ ((__packed__)) LOGGED_UTILITY_STREAM, EFS_ATTR;
2412
2413#endif /* _LINUX_NTFS_LAYOUT_H */
diff --git a/fs/ntfs/lcnalloc.c b/fs/ntfs/lcnalloc.c
new file mode 100644
index 000000000000..23fd911078b1
--- /dev/null
+++ b/fs/ntfs/lcnalloc.c
@@ -0,0 +1,1002 @@
1/*
2 * lcnalloc.c - Cluster (de)allocation code. Part of the Linux-NTFS project.
3 *
4 * Copyright (c) 2004 Anton Altaparmakov
5 *
6 * This program/include file is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as published
8 * by the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program/include file is distributed in the hope that it will be
12 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
13 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program (in the main directory of the Linux-NTFS
18 * distribution in the file COPYING); if not, write to the Free Software
19 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21
22#ifdef NTFS_RW
23
24#include <linux/pagemap.h>
25
26#include "lcnalloc.h"
27#include "debug.h"
28#include "bitmap.h"
29#include "inode.h"
30#include "volume.h"
31#include "attrib.h"
32#include "malloc.h"
33#include "aops.h"
34#include "ntfs.h"
35
36/**
37 * ntfs_cluster_free_from_rl_nolock - free clusters from runlist
38 * @vol: mounted ntfs volume on which to free the clusters
39 * @rl: runlist describing the clusters to free
40 *
41 * Free all the clusters described by the runlist @rl on the volume @vol. In
42 * the case of an error being returned, at least some of the clusters were not
43 * freed.
44 *
45 * Return 0 on success and -errno on error.
46 *
47 * Locking: - The volume lcn bitmap must be locked for writing on entry and is
48 * left locked on return.
49 */
50int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol,
51 const runlist_element *rl)
52{
53 struct inode *lcnbmp_vi = vol->lcnbmp_ino;
54 int ret = 0;
55
56 ntfs_debug("Entering.");
57 for (; rl->length; rl++) {
58 int err;
59
60 if (rl->lcn < 0)
61 continue;
62 err = ntfs_bitmap_clear_run(lcnbmp_vi, rl->lcn, rl->length);
63 if (unlikely(err && (!ret || ret == ENOMEM) && ret != err))
64 ret = err;
65 }
66 ntfs_debug("Done.");
67 return ret;
68}
69
70/**
71 * ntfs_cluster_alloc - allocate clusters on an ntfs volume
72 * @vol: mounted ntfs volume on which to allocate the clusters
73 * @start_vcn: vcn to use for the first allocated cluster
74 * @count: number of clusters to allocate
75 * @start_lcn: starting lcn at which to allocate the clusters (or -1 if none)
76 * @zone: zone from which to allocate the clusters
77 *
78 * Allocate @count clusters preferably starting at cluster @start_lcn or at the
79 * current allocator position if @start_lcn is -1, on the mounted ntfs volume
80 * @vol. @zone is either DATA_ZONE for allocation of normal clusters or
81 * MFT_ZONE for allocation of clusters for the master file table, i.e. the
82 * $MFT/$DATA attribute.
83 *
84 * @start_vcn specifies the vcn of the first allocated cluster. This makes
85 * merging the resulting runlist with the old runlist easier.
86 *
87 * You need to check the return value with IS_ERR(). If this is false, the
88 * function was successful and the return value is a runlist describing the
89 * allocated cluster(s). If IS_ERR() is true, the function failed and
90 * PTR_ERR() gives you the error code.
91 *
92 * Notes on the allocation algorithm
93 * =================================
94 *
95 * There are two data zones. First is the area between the end of the mft zone
96 * and the end of the volume, and second is the area between the start of the
97 * volume and the start of the mft zone. On unmodified/standard NTFS 1.x
98 * volumes, the second data zone does not exist due to the mft zone being
99 * expanded to cover the start of the volume in order to reserve space for the
100 * mft bitmap attribute.
101 *
102 * This is not the prettiest function but the complexity stems from the need of
103 * implementing the mft vs data zoned approach and from the fact that we have
104 * access to the lcn bitmap in portions of up to 8192 bytes at a time, so we
105 * need to cope with crossing over boundaries of two buffers. Further, the
106 * fact that the allocator allows for caller supplied hints as to the location
107 * of where allocation should begin and the fact that the allocator keeps track
108 * of where in the data zones the next natural allocation should occur,
109 * contribute to the complexity of the function. But it should all be
110 * worthwhile, because this allocator should: 1) be a full implementation of
111 * the MFT zone approach used by Windows NT, 2) cause reduction in
112 * fragmentation, and 3) be speedy in allocations (the code is not optimized
113 * for speed, but the algorithm is, so further speed improvements are probably
114 * possible).
115 *
116 * FIXME: We should be monitoring cluster allocation and increment the MFT zone
117 * size dynamically but this is something for the future. We will just cause
118 * heavier fragmentation by not doing it and I am not even sure Windows would
119 * grow the MFT zone dynamically, so it might even be correct not to do this.
120 * The overhead in doing dynamic MFT zone expansion would be very large and
121 * unlikely worth the effort. (AIA)
122 *
123 * TODO: I have added in double the required zone position pointer wrap around
124 * logic which can be optimized to having only one of the two logic sets.
125 * However, having the double logic will work fine, but if we have only one of
126 * the sets and we get it wrong somewhere, then we get into trouble, so
127 * removing the duplicate logic requires _very_ careful consideration of _all_
128 * possible code paths. So at least for now, I am leaving the double logic -
129 * better safe than sorry... (AIA)
130 *
131 * Locking: - The volume lcn bitmap must be unlocked on entry and is unlocked
132 * on return.
133 * - This function takes the volume lcn bitmap lock for writing and
134 * modifies the bitmap contents.
135 */
136runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn,
137 const s64 count, const LCN start_lcn,
138 const NTFS_CLUSTER_ALLOCATION_ZONES zone)
139{
140 LCN zone_start, zone_end, bmp_pos, bmp_initial_pos, last_read_pos, lcn;
141 LCN prev_lcn = 0, prev_run_len = 0, mft_zone_size;
142 s64 clusters;
143 struct inode *lcnbmp_vi;
144 runlist_element *rl = NULL;
145 struct address_space *mapping;
146 struct page *page = NULL;
147 u8 *buf, *byte;
148 int err = 0, rlpos, rlsize, buf_size;
149 u8 pass, done_zones, search_zone, need_writeback = 0, bit;
150
151 ntfs_debug("Entering for start_vcn 0x%llx, count 0x%llx, start_lcn "
152 "0x%llx, zone %s_ZONE.", (unsigned long long)start_vcn,
153 (unsigned long long)count,
154 (unsigned long long)start_lcn,
155 zone == MFT_ZONE ? "MFT" : "DATA");
156 BUG_ON(!vol);
157 lcnbmp_vi = vol->lcnbmp_ino;
158 BUG_ON(!lcnbmp_vi);
159 BUG_ON(start_vcn < 0);
160 BUG_ON(count < 0);
161 BUG_ON(start_lcn < -1);
162 BUG_ON(zone < FIRST_ZONE);
163 BUG_ON(zone > LAST_ZONE);
164
165 /* Return empty runlist if @count == 0 */
166 // FIXME: Do we want to just return NULL instead? (AIA)
167 if (!count) {
168 rl = ntfs_malloc_nofs(PAGE_SIZE);
169 if (!rl)
170 return ERR_PTR(-ENOMEM);
171 rl[0].vcn = start_vcn;
172 rl[0].lcn = LCN_RL_NOT_MAPPED;
173 rl[0].length = 0;
174 return rl;
175 }
176 /* Take the lcnbmp lock for writing. */
177 down_write(&vol->lcnbmp_lock);
178 /*
179 * If no specific @start_lcn was requested, use the current data zone
180 * position, otherwise use the requested @start_lcn but make sure it
181 * lies outside the mft zone. Also set done_zones to 0 (no zones done)
182 * and pass depending on whether we are starting inside a zone (1) or
183 * at the beginning of a zone (2). If requesting from the MFT_ZONE,
184 * we either start at the current position within the mft zone or at
185 * the specified position. If the latter is out of bounds then we start
186 * at the beginning of the MFT_ZONE.
187 */
188 done_zones = 0;
189 pass = 1;
190 /*
191 * zone_start and zone_end are the current search range. search_zone
192 * is 1 for mft zone, 2 for data zone 1 (end of mft zone till end of
193 * volume) and 4 for data zone 2 (start of volume till start of mft
194 * zone).
195 */
196 zone_start = start_lcn;
197 if (zone_start < 0) {
198 if (zone == DATA_ZONE)
199 zone_start = vol->data1_zone_pos;
200 else
201 zone_start = vol->mft_zone_pos;
202 if (!zone_start) {
203 /*
204 * Zone starts at beginning of volume which means a
205 * single pass is sufficient.
206 */
207 pass = 2;
208 }
209 } else if (zone == DATA_ZONE && zone_start >= vol->mft_zone_start &&
210 zone_start < vol->mft_zone_end) {
211 zone_start = vol->mft_zone_end;
212 /*
213 * Starting at beginning of data1_zone which means a single
214 * pass in this zone is sufficient.
215 */
216 pass = 2;
217 } else if (zone == MFT_ZONE && (zone_start < vol->mft_zone_start ||
218 zone_start >= vol->mft_zone_end)) {
219 zone_start = vol->mft_lcn;
220 if (!vol->mft_zone_end)
221 zone_start = 0;
222 /*
223 * Starting at beginning of volume which means a single pass
224 * is sufficient.
225 */
226 pass = 2;
227 }
228 if (zone == MFT_ZONE) {
229 zone_end = vol->mft_zone_end;
230 search_zone = 1;
231 } else /* if (zone == DATA_ZONE) */ {
232 /* Skip searching the mft zone. */
233 done_zones |= 1;
234 if (zone_start >= vol->mft_zone_end) {
235 zone_end = vol->nr_clusters;
236 search_zone = 2;
237 } else {
238 zone_end = vol->mft_zone_start;
239 search_zone = 4;
240 }
241 }
242 /*
243 * bmp_pos is the current bit position inside the bitmap. We use
244 * bmp_initial_pos to determine whether or not to do a zone switch.
245 */
246 bmp_pos = bmp_initial_pos = zone_start;
247
248 /* Loop until all clusters are allocated, i.e. clusters == 0. */
249 clusters = count;
250 rlpos = rlsize = 0;
251 mapping = lcnbmp_vi->i_mapping;
252 while (1) {
253 ntfs_debug("Start of outer while loop: done_zones 0x%x, "
254 "search_zone %i, pass %i, zone_start 0x%llx, "
255 "zone_end 0x%llx, bmp_initial_pos 0x%llx, "
256 "bmp_pos 0x%llx, rlpos %i, rlsize %i.",
257 done_zones, search_zone, pass,
258 (unsigned long long)zone_start,
259 (unsigned long long)zone_end,
260 (unsigned long long)bmp_initial_pos,
261 (unsigned long long)bmp_pos, rlpos, rlsize);
262 /* Loop until we run out of free clusters. */
263 last_read_pos = bmp_pos >> 3;
264 ntfs_debug("last_read_pos 0x%llx.",
265 (unsigned long long)last_read_pos);
266 if (last_read_pos > lcnbmp_vi->i_size) {
267 ntfs_debug("End of attribute reached. "
268 "Skipping to zone_pass_done.");
269 goto zone_pass_done;
270 }
271 if (likely(page)) {
272 if (need_writeback) {
273 ntfs_debug("Marking page dirty.");
274 flush_dcache_page(page);
275 set_page_dirty(page);
276 need_writeback = 0;
277 }
278 ntfs_unmap_page(page);
279 }
280 page = ntfs_map_page(mapping, last_read_pos >>
281 PAGE_CACHE_SHIFT);
282 if (IS_ERR(page)) {
283 err = PTR_ERR(page);
284 ntfs_error(vol->sb, "Failed to map page.");
285 goto out;
286 }
287 buf_size = last_read_pos & ~PAGE_CACHE_MASK;
288 buf = page_address(page) + buf_size;
289 buf_size = PAGE_CACHE_SIZE - buf_size;
290 if (unlikely(last_read_pos + buf_size > lcnbmp_vi->i_size))
291 buf_size = lcnbmp_vi->i_size - last_read_pos;
292 buf_size <<= 3;
293 lcn = bmp_pos & 7;
294 bmp_pos &= ~7;
295 ntfs_debug("Before inner while loop: buf_size %i, lcn 0x%llx, "
296 "bmp_pos 0x%llx, need_writeback %i.", buf_size,
297 (unsigned long long)lcn,
298 (unsigned long long)bmp_pos, need_writeback);
299 while (lcn < buf_size && lcn + bmp_pos < zone_end) {
300 byte = buf + (lcn >> 3);
301 ntfs_debug("In inner while loop: buf_size %i, "
302 "lcn 0x%llx, bmp_pos 0x%llx, "
303 "need_writeback %i, byte ofs 0x%x, "
304 "*byte 0x%x.", buf_size,
305 (unsigned long long)lcn,
306 (unsigned long long)bmp_pos,
307 need_writeback,
308 (unsigned int)(lcn >> 3),
309 (unsigned int)*byte);
310 /* Skip full bytes. */
311 if (*byte == 0xff) {
312 lcn = (lcn + 8) & ~7;
313 ntfs_debug("Continuing while loop 1.");
314 continue;
315 }
316 bit = 1 << (lcn & 7);
317 ntfs_debug("bit %i.", bit);
318 /* If the bit is already set, go onto the next one. */
319 if (*byte & bit) {
320 lcn++;
321 ntfs_debug("Continuing while loop 2.");
322 continue;
323 }
324 /*
325 * Allocate more memory if needed, including space for
326 * the terminator element.
327 * ntfs_malloc_nofs() operates on whole pages only.
328 */
329 if ((rlpos + 2) * sizeof(*rl) > rlsize) {
330 runlist_element *rl2;
331
332 ntfs_debug("Reallocating memory.");
333 if (!rl)
334 ntfs_debug("First free bit is at LCN "
335 "0x%llx.",
336 (unsigned long long)
337 (lcn + bmp_pos));
338 rl2 = ntfs_malloc_nofs(rlsize + (int)PAGE_SIZE);
339 if (unlikely(!rl2)) {
340 err = -ENOMEM;
341 ntfs_error(vol->sb, "Failed to "
342 "allocate memory.");
343 goto out;
344 }
345 memcpy(rl2, rl, rlsize);
346 ntfs_free(rl);
347 rl = rl2;
348 rlsize += PAGE_SIZE;
349 ntfs_debug("Reallocated memory, rlsize 0x%x.",
350 rlsize);
351 }
352 /* Allocate the bitmap bit. */
353 *byte |= bit;
354 /* We need to write this bitmap page to disk. */
355 need_writeback = 1;
356 ntfs_debug("*byte 0x%x, need_writeback is set.",
357 (unsigned int)*byte);
358 /*
359 * Coalesce with previous run if adjacent LCNs.
360 * Otherwise, append a new run.
361 */
362 ntfs_debug("Adding run (lcn 0x%llx, len 0x%llx), "
363 "prev_lcn 0x%llx, lcn 0x%llx, "
364 "bmp_pos 0x%llx, prev_run_len 0x%llx, "
365 "rlpos %i.",
366 (unsigned long long)(lcn + bmp_pos),
367 1ULL, (unsigned long long)prev_lcn,
368 (unsigned long long)lcn,
369 (unsigned long long)bmp_pos,
370 (unsigned long long)prev_run_len,
371 rlpos);
372 if (prev_lcn == lcn + bmp_pos - prev_run_len && rlpos) {
373 ntfs_debug("Coalescing to run (lcn 0x%llx, "
374 "len 0x%llx).",
375 (unsigned long long)
376 rl[rlpos - 1].lcn,
377 (unsigned long long)
378 rl[rlpos - 1].length);
379 rl[rlpos - 1].length = ++prev_run_len;
380 ntfs_debug("Run now (lcn 0x%llx, len 0x%llx), "
381 "prev_run_len 0x%llx.",
382 (unsigned long long)
383 rl[rlpos - 1].lcn,
384 (unsigned long long)
385 rl[rlpos - 1].length,
386 (unsigned long long)
387 prev_run_len);
388 } else {
389 if (likely(rlpos)) {
390 ntfs_debug("Adding new run, (previous "
391 "run lcn 0x%llx, "
392 "len 0x%llx).",
393 (unsigned long long)
394 rl[rlpos - 1].lcn,
395 (unsigned long long)
396 rl[rlpos - 1].length);
397 rl[rlpos].vcn = rl[rlpos - 1].vcn +
398 prev_run_len;
399 } else {
400 ntfs_debug("Adding new run, is first "
401 "run.");
402 rl[rlpos].vcn = start_vcn;
403 }
404 rl[rlpos].lcn = prev_lcn = lcn + bmp_pos;
405 rl[rlpos].length = prev_run_len = 1;
406 rlpos++;
407 }
408 /* Done? */
409 if (!--clusters) {
410 LCN tc;
411 /*
412 * Update the current zone position. Positions
413 * of already scanned zones have been updated
414 * during the respective zone switches.
415 */
416 tc = lcn + bmp_pos + 1;
417 ntfs_debug("Done. Updating current zone "
418 "position, tc 0x%llx, "
419 "search_zone %i.",
420 (unsigned long long)tc,
421 search_zone);
422 switch (search_zone) {
423 case 1:
424 ntfs_debug("Before checks, "
425 "vol->mft_zone_pos "
426 "0x%llx.",
427 (unsigned long long)
428 vol->mft_zone_pos);
429 if (tc >= vol->mft_zone_end) {
430 vol->mft_zone_pos =
431 vol->mft_lcn;
432 if (!vol->mft_zone_end)
433 vol->mft_zone_pos = 0;
434 } else if ((bmp_initial_pos >=
435 vol->mft_zone_pos ||
436 tc > vol->mft_zone_pos)
437 && tc >= vol->mft_lcn)
438 vol->mft_zone_pos = tc;
439 ntfs_debug("After checks, "
440 "vol->mft_zone_pos "
441 "0x%llx.",
442 (unsigned long long)
443 vol->mft_zone_pos);
444 break;
445 case 2:
446 ntfs_debug("Before checks, "
447 "vol->data1_zone_pos "
448 "0x%llx.",
449 (unsigned long long)
450 vol->data1_zone_pos);
451 if (tc >= vol->nr_clusters)
452 vol->data1_zone_pos =
453 vol->mft_zone_end;
454 else if ((bmp_initial_pos >=
455 vol->data1_zone_pos ||
456 tc > vol->data1_zone_pos)
457 && tc >= vol->mft_zone_end)
458 vol->data1_zone_pos = tc;
459 ntfs_debug("After checks, "
460 "vol->data1_zone_pos "
461 "0x%llx.",
462 (unsigned long long)
463 vol->data1_zone_pos);
464 break;
465 case 4:
466 ntfs_debug("Before checks, "
467 "vol->data2_zone_pos "
468 "0x%llx.",
469 (unsigned long long)
470 vol->data2_zone_pos);
471 if (tc >= vol->mft_zone_start)
472 vol->data2_zone_pos = 0;
473 else if (bmp_initial_pos >=
474 vol->data2_zone_pos ||
475 tc > vol->data2_zone_pos)
476 vol->data2_zone_pos = tc;
477 ntfs_debug("After checks, "
478 "vol->data2_zone_pos "
479 "0x%llx.",
480 (unsigned long long)
481 vol->data2_zone_pos);
482 break;
483 default:
484 BUG();
485 }
486 ntfs_debug("Finished. Going to out.");
487 goto out;
488 }
489 lcn++;
490 }
491 bmp_pos += buf_size;
492 ntfs_debug("After inner while loop: buf_size 0x%x, lcn "
493 "0x%llx, bmp_pos 0x%llx, need_writeback %i.",
494 buf_size, (unsigned long long)lcn,
495 (unsigned long long)bmp_pos, need_writeback);
496 if (bmp_pos < zone_end) {
497 ntfs_debug("Continuing outer while loop, "
498 "bmp_pos 0x%llx, zone_end 0x%llx.",
499 (unsigned long long)bmp_pos,
500 (unsigned long long)zone_end);
501 continue;
502 }
503zone_pass_done: /* Finished with the current zone pass. */
504 ntfs_debug("At zone_pass_done, pass %i.", pass);
505 if (pass == 1) {
506 /*
507 * Now do pass 2, scanning the first part of the zone
508 * we omitted in pass 1.
509 */
510 pass = 2;
511 zone_end = zone_start;
512 switch (search_zone) {
513 case 1: /* mft_zone */
514 zone_start = vol->mft_zone_start;
515 break;
516 case 2: /* data1_zone */
517 zone_start = vol->mft_zone_end;
518 break;
519 case 4: /* data2_zone */
520 zone_start = 0;
521 break;
522 default:
523 BUG();
524 }
525 /* Sanity check. */
526 if (zone_end < zone_start)
527 zone_end = zone_start;
528 bmp_pos = zone_start;
529 ntfs_debug("Continuing outer while loop, pass 2, "
530 "zone_start 0x%llx, zone_end 0x%llx, "
531 "bmp_pos 0x%llx.",
532 (unsigned long long)zone_start,
533 (unsigned long long)zone_end,
534 (unsigned long long)bmp_pos);
535 continue;
536 } /* pass == 2 */
537done_zones_check:
538 ntfs_debug("At done_zones_check, search_zone %i, done_zones "
539 "before 0x%x, done_zones after 0x%x.",
540 search_zone, done_zones,
541 done_zones | search_zone);
542 done_zones |= search_zone;
543 if (done_zones < 7) {
544 ntfs_debug("Switching zone.");
545 /* Now switch to the next zone we haven't done yet. */
546 pass = 1;
547 switch (search_zone) {
548 case 1:
549 ntfs_debug("Switching from mft zone to data1 "
550 "zone.");
551 /* Update mft zone position. */
552 if (rlpos) {
553 LCN tc;
554
555 ntfs_debug("Before checks, "
556 "vol->mft_zone_pos "
557 "0x%llx.",
558 (unsigned long long)
559 vol->mft_zone_pos);
560 tc = rl[rlpos - 1].lcn +
561 rl[rlpos - 1].length;
562 if (tc >= vol->mft_zone_end) {
563 vol->mft_zone_pos =
564 vol->mft_lcn;
565 if (!vol->mft_zone_end)
566 vol->mft_zone_pos = 0;
567 } else if ((bmp_initial_pos >=
568 vol->mft_zone_pos ||
569 tc > vol->mft_zone_pos)
570 && tc >= vol->mft_lcn)
571 vol->mft_zone_pos = tc;
572 ntfs_debug("After checks, "
573 "vol->mft_zone_pos "
574 "0x%llx.",
575 (unsigned long long)
576 vol->mft_zone_pos);
577 }
578 /* Switch from mft zone to data1 zone. */
579switch_to_data1_zone: search_zone = 2;
580 zone_start = bmp_initial_pos =
581 vol->data1_zone_pos;
582 zone_end = vol->nr_clusters;
583 if (zone_start == vol->mft_zone_end)
584 pass = 2;
585 if (zone_start >= zone_end) {
586 vol->data1_zone_pos = zone_start =
587 vol->mft_zone_end;
588 pass = 2;
589 }
590 break;
591 case 2:
592 ntfs_debug("Switching from data1 zone to "
593 "data2 zone.");
594 /* Update data1 zone position. */
595 if (rlpos) {
596 LCN tc;
597
598 ntfs_debug("Before checks, "
599 "vol->data1_zone_pos "
600 "0x%llx.",
601 (unsigned long long)
602 vol->data1_zone_pos);
603 tc = rl[rlpos - 1].lcn +
604 rl[rlpos - 1].length;
605 if (tc >= vol->nr_clusters)
606 vol->data1_zone_pos =
607 vol->mft_zone_end;
608 else if ((bmp_initial_pos >=
609 vol->data1_zone_pos ||
610 tc > vol->data1_zone_pos)
611 && tc >= vol->mft_zone_end)
612 vol->data1_zone_pos = tc;
613 ntfs_debug("After checks, "
614 "vol->data1_zone_pos "
615 "0x%llx.",
616 (unsigned long long)
617 vol->data1_zone_pos);
618 }
619 /* Switch from data1 zone to data2 zone. */
620 search_zone = 4;
621 zone_start = bmp_initial_pos =
622 vol->data2_zone_pos;
623 zone_end = vol->mft_zone_start;
624 if (!zone_start)
625 pass = 2;
626 if (zone_start >= zone_end) {
627 vol->data2_zone_pos = zone_start =
628 bmp_initial_pos = 0;
629 pass = 2;
630 }
631 break;
632 case 4:
633 ntfs_debug("Switching from data2 zone to "
634 "data1 zone.");
635 /* Update data2 zone position. */
636 if (rlpos) {
637 LCN tc;
638
639 ntfs_debug("Before checks, "
640 "vol->data2_zone_pos "
641 "0x%llx.",
642 (unsigned long long)
643 vol->data2_zone_pos);
644 tc = rl[rlpos - 1].lcn +
645 rl[rlpos - 1].length;
646 if (tc >= vol->mft_zone_start)
647 vol->data2_zone_pos = 0;
648 else if (bmp_initial_pos >=
649 vol->data2_zone_pos ||
650 tc > vol->data2_zone_pos)
651 vol->data2_zone_pos = tc;
652 ntfs_debug("After checks, "
653 "vol->data2_zone_pos "
654 "0x%llx.",
655 (unsigned long long)
656 vol->data2_zone_pos);
657 }
658 /* Switch from data2 zone to data1 zone. */
659 goto switch_to_data1_zone;
660 default:
661 BUG();
662 }
663 ntfs_debug("After zone switch, search_zone %i, "
664 "pass %i, bmp_initial_pos 0x%llx, "
665 "zone_start 0x%llx, zone_end 0x%llx.",
666 search_zone, pass,
667 (unsigned long long)bmp_initial_pos,
668 (unsigned long long)zone_start,
669 (unsigned long long)zone_end);
670 bmp_pos = zone_start;
671 if (zone_start == zone_end) {
672 ntfs_debug("Empty zone, going to "
673 "done_zones_check.");
674 /* Empty zone. Don't bother searching it. */
675 goto done_zones_check;
676 }
677 ntfs_debug("Continuing outer while loop.");
678 continue;
679 } /* done_zones == 7 */
680 ntfs_debug("All zones are finished.");
681 /*
682 * All zones are finished! If DATA_ZONE, shrink mft zone. If
683 * MFT_ZONE, we have really run out of space.
684 */
685 mft_zone_size = vol->mft_zone_end - vol->mft_zone_start;
686 ntfs_debug("vol->mft_zone_start 0x%llx, vol->mft_zone_end "
687 "0x%llx, mft_zone_size 0x%llx.",
688 (unsigned long long)vol->mft_zone_start,
689 (unsigned long long)vol->mft_zone_end,
690 (unsigned long long)mft_zone_size);
691 if (zone == MFT_ZONE || mft_zone_size <= 0) {
692 ntfs_debug("No free clusters left, going to out.");
693 /* Really no more space left on device. */
694 err = ENOSPC;
695 goto out;
696 } /* zone == DATA_ZONE && mft_zone_size > 0 */
697 ntfs_debug("Shrinking mft zone.");
698 zone_end = vol->mft_zone_end;
699 mft_zone_size >>= 1;
700 if (mft_zone_size > 0)
701 vol->mft_zone_end = vol->mft_zone_start + mft_zone_size;
702 else /* mft zone and data2 zone no longer exist. */
703 vol->data2_zone_pos = vol->mft_zone_start =
704 vol->mft_zone_end = 0;
705 if (vol->mft_zone_pos >= vol->mft_zone_end) {
706 vol->mft_zone_pos = vol->mft_lcn;
707 if (!vol->mft_zone_end)
708 vol->mft_zone_pos = 0;
709 }
710 bmp_pos = zone_start = bmp_initial_pos =
711 vol->data1_zone_pos = vol->mft_zone_end;
712 search_zone = 2;
713 pass = 2;
714 done_zones &= ~2;
715 ntfs_debug("After shrinking mft zone, mft_zone_size 0x%llx, "
716 "vol->mft_zone_start 0x%llx, "
717 "vol->mft_zone_end 0x%llx, "
718 "vol->mft_zone_pos 0x%llx, search_zone 2, "
719 "pass 2, dones_zones 0x%x, zone_start 0x%llx, "
720 "zone_end 0x%llx, vol->data1_zone_pos 0x%llx, "
721 "continuing outer while loop.",
722 (unsigned long long)mft_zone_size,
723 (unsigned long long)vol->mft_zone_start,
724 (unsigned long long)vol->mft_zone_end,
725 (unsigned long long)vol->mft_zone_pos,
726 done_zones, (unsigned long long)zone_start,
727 (unsigned long long)zone_end,
728 (unsigned long long)vol->data1_zone_pos);
729 }
730 ntfs_debug("After outer while loop.");
731out:
732 ntfs_debug("At out.");
733 /* Add runlist terminator element. */
734 if (likely(rl)) {
735 rl[rlpos].vcn = rl[rlpos - 1].vcn + rl[rlpos - 1].length;
736 rl[rlpos].lcn = LCN_RL_NOT_MAPPED;
737 rl[rlpos].length = 0;
738 }
739 if (likely(page && !IS_ERR(page))) {
740 if (need_writeback) {
741 ntfs_debug("Marking page dirty.");
742 flush_dcache_page(page);
743 set_page_dirty(page);
744 need_writeback = 0;
745 }
746 ntfs_unmap_page(page);
747 }
748 if (likely(!err)) {
749 up_write(&vol->lcnbmp_lock);
750 ntfs_debug("Done.");
751 return rl;
752 }
753 ntfs_error(vol->sb, "Failed to allocate clusters, aborting "
754 "(error %i).", err);
755 if (rl) {
756 int err2;
757
758 if (err == ENOSPC)
759 ntfs_debug("Not enough space to complete allocation, "
760 "err ENOSPC, first free lcn 0x%llx, "
761 "could allocate up to 0x%llx "
762 "clusters.",
763 (unsigned long long)rl[0].lcn,
764 (unsigned long long)count - clusters);
765 /* Deallocate all allocated clusters. */
766 ntfs_debug("Attempting rollback...");
767 err2 = ntfs_cluster_free_from_rl_nolock(vol, rl);
768 if (err2) {
769 ntfs_error(vol->sb, "Failed to rollback (error %i). "
770 "Leaving inconsistent metadata! "
771 "Unmount and run chkdsk.", err2);
772 NVolSetErrors(vol);
773 }
774 /* Free the runlist. */
775 ntfs_free(rl);
776 } else if (err == ENOSPC)
777 ntfs_debug("No space left at all, err = ENOSPC, "
778 "first free lcn = 0x%llx.",
779 (unsigned long long)vol->data1_zone_pos);
780 up_write(&vol->lcnbmp_lock);
781 return ERR_PTR(err);
782}
783
784/**
785 * __ntfs_cluster_free - free clusters on an ntfs volume
786 * @vi: vfs inode whose runlist describes the clusters to free
787 * @start_vcn: vcn in the runlist of @vi at which to start freeing clusters
788 * @count: number of clusters to free or -1 for all clusters
789 * @is_rollback: if TRUE this is a rollback operation
790 *
791 * Free @count clusters starting at the cluster @start_vcn in the runlist
792 * described by the vfs inode @vi.
793 *
794 * If @count is -1, all clusters from @start_vcn to the end of the runlist are
795 * deallocated. Thus, to completely free all clusters in a runlist, use
796 * @start_vcn = 0 and @count = -1.
797 *
798 * @is_rollback should always be FALSE, it is for internal use to rollback
799 * errors. You probably want to use ntfs_cluster_free() instead.
800 *
801 * Note, ntfs_cluster_free() does not modify the runlist at all, so the caller
802 * has to deal with it later.
803 *
804 * Return the number of deallocated clusters (not counting sparse ones) on
805 * success and -errno on error.
806 *
807 * Locking: - The runlist described by @vi must be unlocked on entry and is
808 * unlocked on return.
809 * - This function takes the runlist lock of @vi for reading and
810 * sometimes for writing and sometimes modifies the runlist.
811 * - The volume lcn bitmap must be unlocked on entry and is unlocked
812 * on return.
813 * - This function takes the volume lcn bitmap lock for writing and
814 * modifies the bitmap contents.
815 */
816s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn, s64 count,
817 const BOOL is_rollback)
818{
819 s64 delta, to_free, total_freed, real_freed;
820 ntfs_inode *ni;
821 ntfs_volume *vol;
822 struct inode *lcnbmp_vi;
823 runlist_element *rl;
824 int err;
825
826 BUG_ON(!vi);
827 ntfs_debug("Entering for i_ino 0x%lx, start_vcn 0x%llx, count "
828 "0x%llx.%s", vi->i_ino, (unsigned long long)start_vcn,
829 (unsigned long long)count,
830 is_rollback ? " (rollback)" : "");
831 ni = NTFS_I(vi);
832 vol = ni->vol;
833 lcnbmp_vi = vol->lcnbmp_ino;
834 BUG_ON(!lcnbmp_vi);
835 BUG_ON(start_vcn < 0);
836 BUG_ON(count < -1);
837 /*
838 * Lock the lcn bitmap for writing but only if not rolling back. We
839 * must hold the lock all the way including through rollback otherwise
840 * rollback is not possible because once we have cleared a bit and
841 * dropped the lock, anyone could have set the bit again, thus
842 * allocating the cluster for another use.
843 */
844 if (likely(!is_rollback))
845 down_write(&vol->lcnbmp_lock);
846
847 total_freed = real_freed = 0;
848
849 /* This returns with ni->runlist locked for reading on success. */
850 rl = ntfs_find_vcn(ni, start_vcn, FALSE);
851 if (IS_ERR(rl)) {
852 if (!is_rollback)
853 ntfs_error(vol->sb, "Failed to find first runlist "
854 "element (error %li), aborting.",
855 PTR_ERR(rl));
856 err = PTR_ERR(rl);
857 goto err_out;
858 }
859 if (unlikely(rl->lcn < LCN_HOLE)) {
860 if (!is_rollback)
861 ntfs_error(vol->sb, "First runlist element has "
862 "invalid lcn, aborting.");
863 err = -EIO;
864 goto unl_err_out;
865 }
866 /* Find the starting cluster inside the run that needs freeing. */
867 delta = start_vcn - rl->vcn;
868
869 /* The number of clusters in this run that need freeing. */
870 to_free = rl->length - delta;
871 if (count >= 0 && to_free > count)
872 to_free = count;
873
874 if (likely(rl->lcn >= 0)) {
875 /* Do the actual freeing of the clusters in this run. */
876 err = ntfs_bitmap_set_bits_in_run(lcnbmp_vi, rl->lcn + delta,
877 to_free, likely(!is_rollback) ? 0 : 1);
878 if (unlikely(err)) {
879 if (!is_rollback)
880 ntfs_error(vol->sb, "Failed to clear first run "
881 "(error %i), aborting.", err);
882 goto unl_err_out;
883 }
884 /* We have freed @to_free real clusters. */
885 real_freed = to_free;
886 };
887 /* Go to the next run and adjust the number of clusters left to free. */
888 ++rl;
889 if (count >= 0)
890 count -= to_free;
891
892 /* Keep track of the total "freed" clusters, including sparse ones. */
893 total_freed = to_free;
894 /*
895 * Loop over the remaining runs, using @count as a capping value, and
896 * free them.
897 */
898 for (; rl->length && count != 0; ++rl) {
899 if (unlikely(rl->lcn < LCN_HOLE)) {
900 VCN vcn;
901
902 /*
903 * Attempt to map runlist, dropping runlist lock for
904 * the duration.
905 */
906 vcn = rl->vcn;
907 up_read(&ni->runlist.lock);
908 err = ntfs_map_runlist(ni, vcn);
909 if (err) {
910 if (!is_rollback)
911 ntfs_error(vol->sb, "Failed to map "
912 "runlist fragment.");
913 if (err == -EINVAL || err == -ENOENT)
914 err = -EIO;
915 goto err_out;
916 }
917 /*
918 * This returns with ni->runlist locked for reading on
919 * success.
920 */
921 rl = ntfs_find_vcn(ni, vcn, FALSE);
922 if (IS_ERR(rl)) {
923 err = PTR_ERR(rl);
924 if (!is_rollback)
925 ntfs_error(vol->sb, "Failed to find "
926 "subsequent runlist "
927 "element.");
928 goto err_out;
929 }
930 if (unlikely(rl->lcn < LCN_HOLE)) {
931 if (!is_rollback)
932 ntfs_error(vol->sb, "Runlist element "
933 "has invalid lcn "
934 "(0x%llx).",
935 (unsigned long long)
936 rl->lcn);
937 err = -EIO;
938 goto unl_err_out;
939 }
940 }
941 /* The number of clusters in this run that need freeing. */
942 to_free = rl->length;
943 if (count >= 0 && to_free > count)
944 to_free = count;
945
946 if (likely(rl->lcn >= 0)) {
947 /* Do the actual freeing of the clusters in the run. */
948 err = ntfs_bitmap_set_bits_in_run(lcnbmp_vi, rl->lcn,
949 to_free, likely(!is_rollback) ? 0 : 1);
950 if (unlikely(err)) {
951 if (!is_rollback)
952 ntfs_error(vol->sb, "Failed to clear "
953 "subsequent run.");
954 goto unl_err_out;
955 }
956 /* We have freed @to_free real clusters. */
957 real_freed += to_free;
958 }
959 /* Adjust the number of clusters left to free. */
960 if (count >= 0)
961 count -= to_free;
962
963 /* Update the total done clusters. */
964 total_freed += to_free;
965 }
966 up_read(&ni->runlist.lock);
967 if (likely(!is_rollback))
968 up_write(&vol->lcnbmp_lock);
969
970 BUG_ON(count > 0);
971
972 /* We are done. Return the number of actually freed clusters. */
973 ntfs_debug("Done.");
974 return real_freed;
975unl_err_out:
976 up_read(&ni->runlist.lock);
977err_out:
978 if (is_rollback)
979 return err;
980 /* If no real clusters were freed, no need to rollback. */
981 if (!real_freed) {
982 up_write(&vol->lcnbmp_lock);
983 return err;
984 }
985 /*
986 * Attempt to rollback and if that succeeds just return the error code.
987 * If rollback fails, set the volume errors flag, emit an error
988 * message, and return the error code.
989 */
990 delta = __ntfs_cluster_free(vi, start_vcn, total_freed, TRUE);
991 if (delta < 0) {
992 ntfs_error(vol->sb, "Failed to rollback (error %i). Leaving "
993 "inconsistent metadata! Unmount and run "
994 "chkdsk.", (int)delta);
995 NVolSetErrors(vol);
996 }
997 up_write(&vol->lcnbmp_lock);
998 ntfs_error(vol->sb, "Aborting (error %i).", err);
999 return err;
1000}
1001
1002#endif /* NTFS_RW */
diff --git a/fs/ntfs/lcnalloc.h b/fs/ntfs/lcnalloc.h
new file mode 100644
index 000000000000..4cac1c024af6
--- /dev/null
+++ b/fs/ntfs/lcnalloc.h
@@ -0,0 +1,112 @@
1/*
2 * lcnalloc.h - Exports for NTFS kernel cluster (de)allocation. Part of the
3 * Linux-NTFS project.
4 *
5 * Copyright (c) 2004 Anton Altaparmakov
6 *
7 * This program/include file is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as published
9 * by the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program/include file is distributed in the hope that it will be
13 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program (in the main directory of the Linux-NTFS
19 * distribution in the file COPYING); if not, write to the Free Software
20 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 */
22
23#ifndef _LINUX_NTFS_LCNALLOC_H
24#define _LINUX_NTFS_LCNALLOC_H
25
26#ifdef NTFS_RW
27
28#include <linux/fs.h>
29
30#include "types.h"
31#include "runlist.h"
32#include "volume.h"
33
/*
 * Zones of a volume from which cluster allocation can be requested.
 * FIRST_ZONE and LAST_ZONE alias the lowest and highest real zone so that a
 * zone value can be range checked.
 */
typedef enum {
	MFT_ZONE	= 0,		/* Allocate from $MFT zone. */
	DATA_ZONE	= 1,		/* Allocate from $DATA zone. */
	FIRST_ZONE	= MFT_ZONE,	/* For sanity checking. */
	LAST_ZONE	= DATA_ZONE,	/* For sanity checking. */
} NTFS_CLUSTER_ALLOCATION_ZONES;
40
41extern runlist_element *ntfs_cluster_alloc(ntfs_volume *vol,
42 const VCN start_vcn, const s64 count, const LCN start_lcn,
43 const NTFS_CLUSTER_ALLOCATION_ZONES zone);
44
45extern s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn,
46 s64 count, const BOOL is_rollback);
47
48/**
49 * ntfs_cluster_free - free clusters on an ntfs volume
50 * @vi: vfs inode whose runlist describes the clusters to free
51 * @start_vcn: vcn in the runlist of @vi at which to start freeing clusters
52 * @count: number of clusters to free or -1 for all clusters
53 *
54 * Free @count clusters starting at the cluster @start_vcn in the runlist
55 * described by the vfs inode @vi.
56 *
57 * If @count is -1, all clusters from @start_vcn to the end of the runlist are
58 * deallocated. Thus, to completely free all clusters in a runlist, use
59 * @start_vcn = 0 and @count = -1.
60 *
61 * Note, ntfs_cluster_free() does not modify the runlist at all, so the caller
62 * has to deal with it later.
63 *
64 * Return the number of deallocated clusters (not counting sparse ones) on
65 * success and -errno on error.
66 *
67 * Locking: - The runlist described by @vi must be unlocked on entry and is
68 * unlocked on return.
69 * - This function takes the runlist lock of @vi for reading and
70 * sometimes for writing and sometimes modifies the runlist.
71 * - The volume lcn bitmap must be unlocked on entry and is unlocked
72 * on return.
73 * - This function takes the volume lcn bitmap lock for writing and
74 * modifies the bitmap contents.
75 */
76static inline s64 ntfs_cluster_free(struct inode *vi, const VCN start_vcn,
77 s64 count)
78{
79 return __ntfs_cluster_free(vi, start_vcn, count, FALSE);
80}
81
82extern int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol,
83 const runlist_element *rl);
84
85/**
86 * ntfs_cluster_free_from_rl - free clusters from runlist
87 * @vol: mounted ntfs volume on which to free the clusters
88 * @rl: runlist describing the clusters to free
89 *
90 * Free all the clusters described by the runlist @rl on the volume @vol. In
91 * the case of an error being returned, at least some of the clusters were not
92 * freed.
93 *
94 * Return 0 on success and -errno on error.
95 *
96 * Locking: This function takes the volume lcn bitmap lock for writing and
97 * modifies the bitmap contents.
98 */
99static inline int ntfs_cluster_free_from_rl(ntfs_volume *vol,
100 const runlist_element *rl)
101{
102 int ret;
103
104 down_write(&vol->lcnbmp_lock);
105 ret = ntfs_cluster_free_from_rl_nolock(vol, rl);
106 up_write(&vol->lcnbmp_lock);
107 return ret;
108}
109
110#endif /* NTFS_RW */
111
112#endif /* defined _LINUX_NTFS_LCNALLOC_H */
diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c
new file mode 100644
index 000000000000..5e280abafab3
--- /dev/null
+++ b/fs/ntfs/logfile.c
@@ -0,0 +1,705 @@
1/*
2 * logfile.c - NTFS kernel journal handling. Part of the Linux-NTFS project.
3 *
4 * Copyright (c) 2002-2004 Anton Altaparmakov
5 *
6 * This program/include file is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as published
8 * by the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program/include file is distributed in the hope that it will be
12 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
13 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program (in the main directory of the Linux-NTFS
18 * distribution in the file COPYING); if not, write to the Free Software
19 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21
22#ifdef NTFS_RW
23
24#include <linux/types.h>
25#include <linux/fs.h>
26#include <linux/highmem.h>
27#include <linux/buffer_head.h>
28#include <linux/bitops.h>
29
30#include "attrib.h"
31#include "aops.h"
32#include "debug.h"
33#include "logfile.h"
34#include "malloc.h"
35#include "volume.h"
36#include "ntfs.h"
37
38/**
39 * ntfs_check_restart_page_header - check the page header for consistency
40 * @vi: $LogFile inode to which the restart page header belongs
41 * @rp: restart page header to check
42 * @pos: position in @vi at which the restart page header resides
43 *
44 * Check the restart page header @rp for consistency and return TRUE if it is
45 * consistent and FALSE otherwise.
46 *
47 * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not
48 * require the full restart page.
49 */
50static BOOL ntfs_check_restart_page_header(struct inode *vi,
51 RESTART_PAGE_HEADER *rp, s64 pos)
52{
53 u32 logfile_system_page_size, logfile_log_page_size;
54 u16 usa_count, usa_ofs, usa_end, ra_ofs;
55
56 ntfs_debug("Entering.");
57 /*
58 * If the system or log page sizes are smaller than the ntfs block size
59 * or either is not a power of 2 we cannot handle this log file.
60 */
61 logfile_system_page_size = le32_to_cpu(rp->system_page_size);
62 logfile_log_page_size = le32_to_cpu(rp->log_page_size);
63 if (logfile_system_page_size < NTFS_BLOCK_SIZE ||
64 logfile_log_page_size < NTFS_BLOCK_SIZE ||
65 logfile_system_page_size &
66 (logfile_system_page_size - 1) ||
67 logfile_log_page_size & (logfile_log_page_size - 1)) {
68 ntfs_error(vi->i_sb, "$LogFile uses unsupported page size.");
69 return FALSE;
70 }
71 /*
72 * We must be either at !pos (1st restart page) or at pos = system page
73 * size (2nd restart page).
74 */
75 if (pos && pos != logfile_system_page_size) {
76 ntfs_error(vi->i_sb, "Found restart area in incorrect "
77 "position in $LogFile.");
78 return FALSE;
79 }
80 /* We only know how to handle version 1.1. */
81 if (sle16_to_cpu(rp->major_ver) != 1 ||
82 sle16_to_cpu(rp->minor_ver) != 1) {
83 ntfs_error(vi->i_sb, "$LogFile version %i.%i is not "
84 "supported. (This driver supports version "
85 "1.1 only.)", (int)sle16_to_cpu(rp->major_ver),
86 (int)sle16_to_cpu(rp->minor_ver));
87 return FALSE;
88 }
89 /* Verify the size of the update sequence array. */
90 usa_count = 1 + (logfile_system_page_size >> NTFS_BLOCK_SIZE_BITS);
91 if (usa_count != le16_to_cpu(rp->usa_count)) {
92 ntfs_error(vi->i_sb, "$LogFile restart page specifies "
93 "inconsistent update sequence array count.");
94 return FALSE;
95 }
96 /* Verify the position of the update sequence array. */
97 usa_ofs = le16_to_cpu(rp->usa_ofs);
98 usa_end = usa_ofs + usa_count * sizeof(u16);
99 if (usa_ofs < sizeof(RESTART_PAGE_HEADER) ||
100 usa_end > NTFS_BLOCK_SIZE - sizeof(u16)) {
101 ntfs_error(vi->i_sb, "$LogFile restart page specifies "
102 "inconsistent update sequence array offset.");
103 return FALSE;
104 }
105 /*
106 * Verify the position of the restart area. It must be:
107 * - aligned to 8-byte boundary,
108 * - after the update sequence array, and
109 * - within the system page size.
110 */
111 ra_ofs = le16_to_cpu(rp->restart_area_offset);
112 if (ra_ofs & 7 || ra_ofs < usa_end ||
113 ra_ofs > logfile_system_page_size) {
114 ntfs_error(vi->i_sb, "$LogFile restart page specifies "
115 "inconsistent restart area offset.");
116 return FALSE;
117 }
118 /*
119 * Only restart pages modified by chkdsk are allowed to have chkdsk_lsn
120 * set.
121 */
122 if (!ntfs_is_chkd_record(rp->magic) && sle64_to_cpu(rp->chkdsk_lsn)) {
123 ntfs_error(vi->i_sb, "$LogFile restart page is not modified "
124 "chkdsk but a chkdsk LSN is specified.");
125 return FALSE;
126 }
127 ntfs_debug("Done.");
128 return TRUE;
129}
130
131/**
132 * ntfs_check_restart_area - check the restart area for consistency
133 * @vi: $LogFile inode to which the restart page belongs
134 * @rp: restart page whose restart area to check
135 *
136 * Check the restart area of the restart page @rp for consistency and return
137 * TRUE if it is consistent and FALSE otherwise.
138 *
139 * This function assumes that the restart page header has already been
140 * consistency checked.
141 *
142 * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not
143 * require the full restart page.
144 */
145static BOOL ntfs_check_restart_area(struct inode *vi, RESTART_PAGE_HEADER *rp)
146{
147 u64 file_size;
148 RESTART_AREA *ra;
149 u16 ra_ofs, ra_len, ca_ofs;
150 u8 fs_bits;
151
152 ntfs_debug("Entering.");
153 ra_ofs = le16_to_cpu(rp->restart_area_offset);
154 ra = (RESTART_AREA*)((u8*)rp + ra_ofs);
155 /*
156 * Everything before ra->file_size must be before the first word
157 * protected by an update sequence number. This ensures that it is
158 * safe to access ra->client_array_offset.
159 */
160 if (ra_ofs + offsetof(RESTART_AREA, file_size) >
161 NTFS_BLOCK_SIZE - sizeof(u16)) {
162 ntfs_error(vi->i_sb, "$LogFile restart area specifies "
163 "inconsistent file offset.");
164 return FALSE;
165 }
166 /*
167 * Now that we can access ra->client_array_offset, make sure everything
168 * up to the log client array is before the first word protected by an
169 * update sequence number. This ensures we can access all of the
170 * restart area elements safely. Also, the client array offset must be
171 * aligned to an 8-byte boundary.
172 */
173 ca_ofs = le16_to_cpu(ra->client_array_offset);
174 if (((ca_ofs + 7) & ~7) != ca_ofs ||
175 ra_ofs + ca_ofs > NTFS_BLOCK_SIZE - sizeof(u16)) {
176 ntfs_error(vi->i_sb, "$LogFile restart area specifies "
177 "inconsistent client array offset.");
178 return FALSE;
179 }
180 /*
181 * The restart area must end within the system page size both when
182 * calculated manually and as specified by ra->restart_area_length.
183 * Also, the calculated length must not exceed the specified length.
184 */
185 ra_len = ca_ofs + le16_to_cpu(ra->log_clients) *
186 sizeof(LOG_CLIENT_RECORD);
187 if (ra_ofs + ra_len > le32_to_cpu(rp->system_page_size) ||
188 ra_ofs + le16_to_cpu(ra->restart_area_length) >
189 le32_to_cpu(rp->system_page_size) ||
190 ra_len > le16_to_cpu(ra->restart_area_length)) {
191 ntfs_error(vi->i_sb, "$LogFile restart area is out of bounds "
192 "of the system page size specified by the "
193 "restart page header and/or the specified "
194 "restart area length is inconsistent.");
195 return FALSE;
196 }
197 /*
198 * The ra->client_free_list and ra->client_in_use_list must be either
199 * LOGFILE_NO_CLIENT or less than ra->log_clients or they are
200 * overflowing the client array.
201 */
202 if ((ra->client_free_list != LOGFILE_NO_CLIENT &&
203 le16_to_cpu(ra->client_free_list) >=
204 le16_to_cpu(ra->log_clients)) ||
205 (ra->client_in_use_list != LOGFILE_NO_CLIENT &&
206 le16_to_cpu(ra->client_in_use_list) >=
207 le16_to_cpu(ra->log_clients))) {
208 ntfs_error(vi->i_sb, "$LogFile restart area specifies "
209 "overflowing client free and/or in use lists.");
210 return FALSE;
211 }
212 /*
213 * Check ra->seq_number_bits against ra->file_size for consistency.
214 * We cannot just use ffs() because the file size is not a power of 2.
215 */
216 file_size = (u64)sle64_to_cpu(ra->file_size);
217 fs_bits = 0;
218 while (file_size) {
219 file_size >>= 1;
220 fs_bits++;
221 }
222 if (le32_to_cpu(ra->seq_number_bits) != 67 - fs_bits) {
223 ntfs_error(vi->i_sb, "$LogFile restart area specifies "
224 "inconsistent sequence number bits.");
225 return FALSE;
226 }
227 /* The log record header length must be a multiple of 8. */
228 if (((le16_to_cpu(ra->log_record_header_length) + 7) & ~7) !=
229 le16_to_cpu(ra->log_record_header_length)) {
230 ntfs_error(vi->i_sb, "$LogFile restart area specifies "
231 "inconsistent log record header length.");
232 return FALSE;
233 }
234 /* Dito for the log page data offset. */
235 if (((le16_to_cpu(ra->log_page_data_offset) + 7) & ~7) !=
236 le16_to_cpu(ra->log_page_data_offset)) {
237 ntfs_error(vi->i_sb, "$LogFile restart area specifies "
238 "inconsistent log page data offset.");
239 return FALSE;
240 }
241 ntfs_debug("Done.");
242 return TRUE;
243}
244
245/**
246 * ntfs_check_log_client_array - check the log client array for consistency
247 * @vi: $LogFile inode to which the restart page belongs
248 * @rp: restart page whose log client array to check
249 *
250 * Check the log client array of the restart page @rp for consistency and
251 * return TRUE if it is consistent and FALSE otherwise.
252 *
253 * This function assumes that the restart page header and the restart area have
254 * already been consistency checked.
255 *
256 * Unlike ntfs_check_restart_page_header() and ntfs_check_restart_area(), this
257 * function needs @rp->system_page_size bytes in @rp, i.e. it requires the full
258 * restart page and the page must be multi sector transfer deprotected.
259 */
260static BOOL ntfs_check_log_client_array(struct inode *vi,
261 RESTART_PAGE_HEADER *rp)
262{
263 RESTART_AREA *ra;
264 LOG_CLIENT_RECORD *ca, *cr;
265 u16 nr_clients, idx;
266 BOOL in_free_list, idx_is_first;
267
268 ntfs_debug("Entering.");
269 ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset));
270 ca = (LOG_CLIENT_RECORD*)((u8*)ra +
271 le16_to_cpu(ra->client_array_offset));
272 /*
273 * Check the ra->client_free_list first and then check the
274 * ra->client_in_use_list. Check each of the log client records in
275 * each of the lists and check that the array does not overflow the
276 * ra->log_clients value. Also keep track of the number of records
277 * visited as there cannot be more than ra->log_clients records and
278 * that way we detect eventual loops in within a list.
279 */
280 nr_clients = le16_to_cpu(ra->log_clients);
281 idx = le16_to_cpu(ra->client_free_list);
282 in_free_list = TRUE;
283check_list:
284 for (idx_is_first = TRUE; idx != LOGFILE_NO_CLIENT_CPU; nr_clients--,
285 idx = le16_to_cpu(cr->next_client)) {
286 if (!nr_clients || idx >= le16_to_cpu(ra->log_clients))
287 goto err_out;
288 /* Set @cr to the current log client record. */
289 cr = ca + idx;
290 /* The first log client record must not have a prev_client. */
291 if (idx_is_first) {
292 if (cr->prev_client != LOGFILE_NO_CLIENT)
293 goto err_out;
294 idx_is_first = FALSE;
295 }
296 }
297 /* Switch to and check the in use list if we just did the free list. */
298 if (in_free_list) {
299 in_free_list = FALSE;
300 idx = le16_to_cpu(ra->client_in_use_list);
301 goto check_list;
302 }
303 ntfs_debug("Done.");
304 return TRUE;
305err_out:
306 ntfs_error(vi->i_sb, "$LogFile log client array is corrupt.");
307 return FALSE;
308}
309
310/**
311 * ntfs_check_and_load_restart_page - check the restart page for consistency
312 * @vi: $LogFile inode to which the restart page belongs
313 * @rp: restart page to check
314 * @pos: position in @vi at which the restart page resides
315 * @wrp: copy of the multi sector transfer deprotected restart page
316 *
317 * Check the restart page @rp for consistency and return TRUE if it is
318 * consistent and FALSE otherwise.
319 *
320 * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not
321 * require the full restart page.
322 *
323 * If @wrp is not NULL, on success, *@wrp will point to a buffer containing a
324 * copy of the complete multi sector transfer deprotected page. On failure,
325 * *@wrp is undefined.
326 */
327static BOOL ntfs_check_and_load_restart_page(struct inode *vi,
328 RESTART_PAGE_HEADER *rp, s64 pos, RESTART_PAGE_HEADER **wrp)
329{
330 RESTART_AREA *ra;
331 RESTART_PAGE_HEADER *trp;
332 int size;
333 BOOL ret;
334
335 ntfs_debug("Entering.");
336 /* Check the restart page header for consistency. */
337 if (!ntfs_check_restart_page_header(vi, rp, pos)) {
338 /* Error output already done inside the function. */
339 return FALSE;
340 }
341 /* Check the restart area for consistency. */
342 if (!ntfs_check_restart_area(vi, rp)) {
343 /* Error output already done inside the function. */
344 return FALSE;
345 }
346 ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset));
347 /*
348 * Allocate a buffer to store the whole restart page so we can multi
349 * sector transfer deprotect it.
350 */
351 trp = ntfs_malloc_nofs(le32_to_cpu(rp->system_page_size));
352 if (!trp) {
353 ntfs_error(vi->i_sb, "Failed to allocate memory for $LogFile "
354 "restart page buffer.");
355 return FALSE;
356 }
357 /*
358 * Read the whole of the restart page into the buffer. If it fits
359 * completely inside @rp, just copy it from there. Otherwise map all
360 * the required pages and copy the data from them.
361 */
362 size = PAGE_CACHE_SIZE - (pos & ~PAGE_CACHE_MASK);
363 if (size >= le32_to_cpu(rp->system_page_size)) {
364 memcpy(trp, rp, le32_to_cpu(rp->system_page_size));
365 } else {
366 pgoff_t idx;
367 struct page *page;
368 int have_read, to_read;
369
370 /* First copy what we already have in @rp. */
371 memcpy(trp, rp, size);
372 /* Copy the remaining data one page at a time. */
373 have_read = size;
374 to_read = le32_to_cpu(rp->system_page_size) - size;
375 idx = (pos + size) >> PAGE_CACHE_SHIFT;
376 BUG_ON((pos + size) & ~PAGE_CACHE_MASK);
377 do {
378 page = ntfs_map_page(vi->i_mapping, idx);
379 if (IS_ERR(page)) {
380 ntfs_error(vi->i_sb, "Error mapping $LogFile "
381 "page (index %lu).", idx);
382 goto err_out;
383 }
384 size = min_t(int, to_read, PAGE_CACHE_SIZE);
385 memcpy((u8*)trp + have_read, page_address(page), size);
386 ntfs_unmap_page(page);
387 have_read += size;
388 to_read -= size;
389 idx++;
390 } while (to_read > 0);
391 }
392 /* Perform the multi sector transfer deprotection on the buffer. */
393 if (post_read_mst_fixup((NTFS_RECORD*)trp,
394 le32_to_cpu(rp->system_page_size))) {
395 ntfs_error(vi->i_sb, "Multi sector transfer error detected in "
396 "$LogFile restart page.");
397 goto err_out;
398 }
399 /* Check the log client records for consistency. */
400 ret = ntfs_check_log_client_array(vi, trp);
401 if (ret && wrp)
402 *wrp = trp;
403 else
404 ntfs_free(trp);
405 ntfs_debug("Done.");
406 return ret;
407err_out:
408 ntfs_free(trp);
409 return FALSE;
410}
411
412/**
413 * ntfs_ckeck_logfile - check in the journal if the volume is consistent
414 * @log_vi: struct inode of loaded journal $LogFile to check
415 *
416 * Check the $LogFile journal for consistency and return TRUE if it is
417 * consistent and FALSE if not.
418 *
419 * At present we only check the two restart pages and ignore the log record
420 * pages.
421 *
422 * Note that the MstProtected flag is not set on the $LogFile inode and hence
423 * when reading pages they are not deprotected. This is because we do not know
424 * if the $LogFile was created on a system with a different page size to ours
425 * yet and mst deprotection would fail if our page size is smaller.
426 */
427BOOL ntfs_check_logfile(struct inode *log_vi)
428{
429 s64 size, pos, rstr1_pos, rstr2_pos;
430 ntfs_volume *vol = NTFS_SB(log_vi->i_sb);
431 struct address_space *mapping = log_vi->i_mapping;
432 struct page *page = NULL;
433 u8 *kaddr = NULL;
434 RESTART_PAGE_HEADER *rstr1_ph = NULL;
435 RESTART_PAGE_HEADER *rstr2_ph = NULL;
436 int log_page_size, log_page_mask, ofs;
437 BOOL logfile_is_empty = TRUE;
438 BOOL rstr1_found = FALSE;
439 BOOL rstr2_found = FALSE;
440 u8 log_page_bits;
441
442 ntfs_debug("Entering.");
443 /* An empty $LogFile must have been clean before it got emptied. */
444 if (NVolLogFileEmpty(vol))
445 goto is_empty;
446 size = log_vi->i_size;
447 /* Make sure the file doesn't exceed the maximum allowed size. */
448 if (size > MaxLogFileSize)
449 size = MaxLogFileSize;
450 /*
451 * Truncate size to a multiple of the page cache size or the default
452 * log page size if the page cache size is between the default log page
453 * log page size if the page cache size is between the default log page
454 * size and twice that.
455 */
456 if (PAGE_CACHE_SIZE >= DefaultLogPageSize && PAGE_CACHE_SIZE <=
457 DefaultLogPageSize * 2)
458 log_page_size = DefaultLogPageSize;
459 else
460 log_page_size = PAGE_CACHE_SIZE;
461 log_page_mask = log_page_size - 1;
462 /*
463 * Use generic_ffs() instead of ffs() to enable the compiler to
464 * optimize log_page_size and log_page_bits into constants.
465 */
466 log_page_bits = generic_ffs(log_page_size) - 1;
467 size &= ~(log_page_size - 1);
468 /*
469 * Ensure the log file is big enough to store at least the two restart
470 * pages and the minimum number of log record pages.
471 */
472 if (size < log_page_size * 2 || (size - log_page_size * 2) >>
473 log_page_bits < MinLogRecordPages) {
474 ntfs_error(vol->sb, "$LogFile is too small.");
475 return FALSE;
476 }
477 /*
478 * Read through the file looking for a restart page. Since the restart
479 * page header is at the beginning of a page we only need to search at
480 * what could be the beginning of a page (for each page size) rather
481 * than scanning the whole file byte by byte. If all potential places
482 * contain empty and uninitialzed records, the log file can be assumed
483 * to be empty.
484 */
485 for (pos = 0; pos < size; pos <<= 1) {
486 pgoff_t idx = pos >> PAGE_CACHE_SHIFT;
487 if (!page || page->index != idx) {
488 if (page)
489 ntfs_unmap_page(page);
490 page = ntfs_map_page(mapping, idx);
491 if (IS_ERR(page)) {
492 ntfs_error(vol->sb, "Error mapping $LogFile "
493 "page (index %lu).", idx);
494 return FALSE;
495 }
496 }
497 kaddr = (u8*)page_address(page) + (pos & ~PAGE_CACHE_MASK);
498 /*
499 * A non-empty block means the logfile is not empty while an
500 * empty block after a non-empty block has been encountered
501 * means we are done.
502 */
503 if (!ntfs_is_empty_recordp((le32*)kaddr))
504 logfile_is_empty = FALSE;
505 else if (!logfile_is_empty)
506 break;
507 /*
508 * A log record page means there cannot be a restart page after
509 * this so no need to continue searching.
510 */
511 if (ntfs_is_rcrd_recordp((le32*)kaddr))
512 break;
513 /*
514 * A modified by chkdsk restart page means we cannot handle
515 * this log file.
516 */
517 if (ntfs_is_chkd_recordp((le32*)kaddr)) {
518 ntfs_error(vol->sb, "$LogFile has been modified by "
519 "chkdsk. Mount this volume in "
520 "Windows.");
521 goto err_out;
522 }
523 /* If not a restart page, continue. */
524 if (!ntfs_is_rstr_recordp((le32*)kaddr)) {
525 /* Skip to the minimum page size for the next one. */
526 if (!pos)
527 pos = NTFS_BLOCK_SIZE >> 1;
528 continue;
529 }
530 /* We now know we have a restart page. */
531 if (!pos) {
532 rstr1_found = TRUE;
533 rstr1_pos = pos;
534 } else {
535 if (rstr2_found) {
536 ntfs_error(vol->sb, "Found more than two "
537 "restart pages in $LogFile.");
538 goto err_out;
539 }
540 rstr2_found = TRUE;
541 rstr2_pos = pos;
542 }
543 /*
544 * Check the restart page for consistency and get a copy of the
545 * complete multi sector transfer deprotected restart page.
546 */
547 if (!ntfs_check_and_load_restart_page(log_vi,
548 (RESTART_PAGE_HEADER*)kaddr, pos,
549 !pos ? &rstr1_ph : &rstr2_ph)) {
550 /* Error output already done inside the function. */
551 goto err_out;
552 }
553 /*
554 * We have a valid restart page. The next one must be after
555 * a whole system page size as specified by the valid restart
556 * page.
557 */
558 if (!pos)
559 pos = le32_to_cpu(rstr1_ph->system_page_size) >> 1;
560 }
561 if (page) {
562 ntfs_unmap_page(page);
563 page = NULL;
564 }
565 if (logfile_is_empty) {
566 NVolSetLogFileEmpty(vol);
567is_empty:
568 ntfs_debug("Done. ($LogFile is empty.)");
569 return TRUE;
570 }
571 if (!rstr1_found || !rstr2_found) {
572 ntfs_error(vol->sb, "Did not find two restart pages in "
573 "$LogFile.");
574 goto err_out;
575 }
576 /*
577 * The two restart areas must be identical except for the update
578 * sequence number.
579 */
580 ofs = le16_to_cpu(rstr1_ph->usa_ofs);
581 if (memcmp(rstr1_ph, rstr2_ph, ofs) || (ofs += sizeof(u16),
582 memcmp((u8*)rstr1_ph + ofs, (u8*)rstr2_ph + ofs,
583 le32_to_cpu(rstr1_ph->system_page_size) - ofs))) {
584 ntfs_error(vol->sb, "The two restart pages in $LogFile do not "
585 "match.");
586 goto err_out;
587 }
588 ntfs_free(rstr1_ph);
589 ntfs_free(rstr2_ph);
590 /* All consistency checks passed. */
591 ntfs_debug("Done.");
592 return TRUE;
593err_out:
594 if (page)
595 ntfs_unmap_page(page);
596 if (rstr1_ph)
597 ntfs_free(rstr1_ph);
598 if (rstr2_ph)
599 ntfs_free(rstr2_ph);
600 return FALSE;
601}
602
603/**
604 * ntfs_is_logfile_clean - check in the journal if the volume is clean
605 * @log_vi: struct inode of loaded journal $LogFile to check
606 *
607 * Analyze the $LogFile journal and return TRUE if it indicates the volume was
608 * shutdown cleanly and FALSE if not.
609 *
610 * At present we only look at the two restart pages and ignore the log record
611 * pages. This is a little bit crude in that there will be a very small number
612 * of cases where we think that a volume is dirty when in fact it is clean.
613 * This should only affect volumes that have not been shutdown cleanly but did
614 * not have any pending, non-check-pointed i/o, i.e. they were completely idle
615 * at least for the five seconds preceeding the unclean shutdown.
616 *
617 * This function assumes that the $LogFile journal has already been consistency
618 * checked by a call to ntfs_check_logfile() and in particular if the $LogFile
619 * is empty this function requires that NVolLogFileEmpty() is true otherwise an
620 * empty volume will be reported as dirty.
621 */
622BOOL ntfs_is_logfile_clean(struct inode *log_vi)
623{
624 ntfs_volume *vol = NTFS_SB(log_vi->i_sb);
625 struct page *page;
626 RESTART_PAGE_HEADER *rp;
627 RESTART_AREA *ra;
628
629 ntfs_debug("Entering.");
630 /* An empty $LogFile must have been clean before it got emptied. */
631 if (NVolLogFileEmpty(vol)) {
632 ntfs_debug("Done. ($LogFile is empty.)");
633 return TRUE;
634 }
635 /*
636 * Read the first restart page. It will be possibly incomplete and
637 * will not be multi sector transfer deprotected but we only need the
638 * first NTFS_BLOCK_SIZE bytes so it does not matter.
639 */
640 page = ntfs_map_page(log_vi->i_mapping, 0);
641 if (IS_ERR(page)) {
642 ntfs_error(vol->sb, "Error mapping $LogFile page (index 0).");
643 return FALSE;
644 }
645 rp = (RESTART_PAGE_HEADER*)page_address(page);
646 if (!ntfs_is_rstr_record(rp->magic)) {
647 ntfs_error(vol->sb, "No restart page found at offset zero in "
648 "$LogFile. This is probably a bug in that "
649 "the $LogFile should have been consistency "
650 "checked before calling this function.");
651 goto err_out;
652 }
653 ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset));
654 /*
655 * If the $LogFile has active clients, i.e. it is open, and we do not
656 * have the RESTART_VOLUME_IS_CLEAN bit set in the restart area flags,
657 * we assume there was an unclean shutdown.
658 */
659 if (ra->client_in_use_list != LOGFILE_NO_CLIENT &&
660 !(ra->flags & RESTART_VOLUME_IS_CLEAN)) {
661 ntfs_debug("Done. $LogFile indicates a dirty shutdown.");
662 goto err_out;
663 }
664 ntfs_unmap_page(page);
665 /* $LogFile indicates a clean shutdown. */
666 ntfs_debug("Done. $LogFile indicates a clean shutdown.");
667 return TRUE;
668err_out:
669 ntfs_unmap_page(page);
670 return FALSE;
671}
672
673/**
674 * ntfs_empty_logfile - empty the contents of the $LogFile journal
675 * @log_vi: struct inode of loaded journal $LogFile to empty
676 *
677 * Empty the contents of the $LogFile journal @log_vi and return TRUE on
678 * success and FALSE on error.
679 *
680 * This function assumes that the $LogFile journal has already been consistency
681 * checked by a call to ntfs_check_logfile() and that ntfs_is_logfile_clean()
682 * has been used to ensure that the $LogFile is clean.
683 */
684BOOL ntfs_empty_logfile(struct inode *log_vi)
685{
686 ntfs_volume *vol = NTFS_SB(log_vi->i_sb);
687
688 ntfs_debug("Entering.");
689 if (!NVolLogFileEmpty(vol)) {
690 int err;
691
692 err = ntfs_attr_set(NTFS_I(log_vi), 0, log_vi->i_size, 0xff);
693 if (unlikely(err)) {
694 ntfs_error(vol->sb, "Failed to fill $LogFile with "
695 "0xff bytes (error code %i).", err);
696 return FALSE;
697 }
698 /* Set the flag so we do not have to do it again on remount. */
699 NVolSetLogFileEmpty(vol);
700 }
701 ntfs_debug("Done.");
702 return TRUE;
703}
704
705#endif /* NTFS_RW */
diff --git a/fs/ntfs/logfile.h b/fs/ntfs/logfile.h
new file mode 100644
index 000000000000..4ee4378de061
--- /dev/null
+++ b/fs/ntfs/logfile.h
@@ -0,0 +1,307 @@
1/*
2 * logfile.h - Defines for NTFS kernel journal ($LogFile) handling. Part of
3 * the Linux-NTFS project.
4 *
5 * Copyright (c) 2000-2004 Anton Altaparmakov
6 *
7 * This program/include file is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as published
9 * by the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program/include file is distributed in the hope that it will be
13 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program (in the main directory of the Linux-NTFS
19 * distribution in the file COPYING); if not, write to the Free Software
20 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 */
22
23#ifndef _LINUX_NTFS_LOGFILE_H
24#define _LINUX_NTFS_LOGFILE_H
25
26#ifdef NTFS_RW
27
28#include <linux/fs.h>
29
30#include "types.h"
31#include "endian.h"
32#include "layout.h"
33
34/*
35 * Journal ($LogFile) organization:
36 *
37 * Two restart areas present in the first two pages (restart pages, one restart
38 * area in each page). When the volume is dismounted they should be identical,
39 * except for the update sequence array which usually has a different update
40 * sequence number.
41 *
42 * These are followed by log records organized in pages headed by a log record
43 * header going up to log file size. Not all pages contain log records when a
44 * volume is first formatted, but as the volume ages, all records will be used.
45 * When the log file fills up, the records at the beginning are purged (by
46 * modifying the oldest_lsn to a higher value presumably) and writing begins
47 * at the beginning of the file. Effectively, the log file is viewed as a
48 * circular entity.
49 *
50 * NOTE: Windows NT, 2000, and XP all use log file version 1.1 but they accept
51 * versions <= 1.x, including 0.-1. (Yes, that is a minus one in there!) We
52 * probably only want to support 1.1 as this seems to be the current version
53 * and we don't know how that differs from the older versions. The only
54 * exception is if the journal is clean as marked by the two restart pages
55 * then it doesn't matter whether we are on an earlier version. We can just
56 * reinitialize the logfile and start again with version 1.1.
57 */
58
59/* Some $LogFile related constants. */
60#define MaxLogFileSize 0x100000000ULL /* 2^32 bytes; larger sizes are clamped to this when checking. */
61#define DefaultLogPageSize 4096 /* Default byte size of a log page. */
62#define MinLogRecordPages 48 /* Minimum number of log record pages after the two restart pages. */
63
64/*
65 * Log file restart page header (begins each restart page; the RESTART_AREA is found at restart_area_offset).
66 */
67typedef struct {
68/*Ofs*/
69/* 0 NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */
70/* 0*/ NTFS_RECORD_TYPE magic; /* The magic is "RSTR". */
71/* 4*/ le16 usa_ofs; /* See NTFS_RECORD definition in layout.h.
72 When creating, set this to be immediately
73 after this header structure (without any
74 alignment). */
75/* 6*/ le16 usa_count; /* See NTFS_RECORD definition in layout.h. */
76
77/* 8*/ leLSN chkdsk_lsn; /* The last log file sequence number found by
78 chkdsk. Only used when the magic is changed
79 to "CHKD". Otherwise this is zero. */
80/* 16*/ le32 system_page_size; /* Byte size of system pages when the log file
81 was created, has to be >= 512 and a power of
82 2. Use this to calculate the required size
83 of the usa (usa_count) and add it to usa_ofs.
84 Then verify that the result is less than the
85 value of the restart_area_offset. */
86/* 20*/ le32 log_page_size; /* Byte size of log file pages, has to be >=
87 512 and a power of 2. The default is 4096
88 and is used when the system page size is
89 between 4096 and 8192. Otherwise this is
90 set to the system page size instead. */
91/* 24*/ le16 restart_area_offset;/* Byte offset from the start of this header to
92 the RESTART_AREA. Value has to be aligned
93 to 8-byte boundary. When creating, set this
94 to be after the usa. */
95/* 26*/ sle16 minor_ver; /* Log file minor version. Only check if major
96 version is 1. */
97/* 28*/ sle16 major_ver; /* Log file major version. We only support
98 version 1.1. */
99/* sizeof() = 30 (0x1e) bytes */
100} __attribute__ ((__packed__)) RESTART_PAGE_HEADER;
101
102/*
103 * Constant for the log client indices meaning that there are no client records
104 * in this particular client array. Also inside the client records themselves,
105 * this means that there are no client records preceding or following this one.
106 */
107#define LOGFILE_NO_CLIENT const_cpu_to_le16(0xffff) /* For comparing against the le16 on-disk fields. */
108#define LOGFILE_NO_CLIENT_CPU 0xffff /* The same index value without the le16 conversion. */
109
110/*
111 * These are the so far known restart area flags (RESTART_*, 16-bit) which contain
112 * information about the log file in which they are present.
113 */
114enum {
115 RESTART_VOLUME_IS_CLEAN = const_cpu_to_le16(0x0002), /* See the flags field of RESTART_AREA. */
116 RESTART_SPACE_FILLER = 0xffff, /* gcc: Force enum bit width to 16. */
117} __attribute__ ((__packed__));
118
119typedef le16 RESTART_AREA_FLAGS;
120
121/*
122 * Log file restart area record. The offset of this record is found by adding
123 * the offset of the RESTART_PAGE_HEADER to the restart_area_offset value found
124 * in it. See notes at restart_area_offset above.
125 */
126typedef struct {
127/*Ofs*/
128/* 0*/ leLSN current_lsn; /* The current, i.e. last LSN inside the log
129 when the restart area was last written.
130 This happens often but what is the interval?
131 Is it just fixed time or is it every time a
132 check point is written or something else?
133 On create set to 0. */
134/* 8*/ le16 log_clients; /* Number of log client records in the array of
135 log client records which follows this
136 restart area. Must be 1. */
137/* 10*/ le16 client_free_list; /* The index of the first free log client record
138 in the array of log client records.
139 LOGFILE_NO_CLIENT means that there are no
140 free log client records in the array.
141 If != LOGFILE_NO_CLIENT, check that
142 log_clients > client_free_list. On Win2k
143 and presumably earlier, on a clean volume
144 this is != LOGFILE_NO_CLIENT, and it should
145 be 0, i.e. the first (and only) client
146 record is free and thus the logfile is
147 closed and hence clean. A dirty volume
148 would have left the logfile open and hence
149 this would be LOGFILE_NO_CLIENT. On WinXP
150 and presumably later, the logfile is always
151 open, even on clean shutdown so this should
152 always be LOGFILE_NO_CLIENT. */
153/* 12*/ le16 client_in_use_list;/* The index of the first in-use log client
154 record in the array of log client records.
155 LOGFILE_NO_CLIENT means that there are no
156 in-use log client records in the array. If
157 != LOGFILE_NO_CLIENT check that log_clients
158 > client_in_use_list. On Win2k and
159 presumably earlier, on a clean volume this
160 is LOGFILE_NO_CLIENT, i.e. there are no
161 client records in use and thus the logfile
162 is closed and hence clean. A dirty volume
163 would have left the logfile open and hence
164 this would be != LOGFILE_NO_CLIENT, and it
165 should be 0, i.e. the first (and only)
166 client record is in use. On WinXP and
167 presumably later, the logfile is always
168 open, even on clean shutdown so this should
169 always be 0. */
170/* 14*/ RESTART_AREA_FLAGS flags;/* Flags modifying LFS behaviour. On Win2k
171 and presumably earlier this is always 0. On
172 WinXP and presumably later, if the logfile
173 was shutdown cleanly, the second bit,
174 RESTART_VOLUME_IS_CLEAN, is set. This bit
175 is cleared when the volume is mounted by
176 WinXP and set when the volume is dismounted,
177 thus if the logfile is dirty, this bit is
178 clear. Thus we don't need to check the
179 Windows version to determine if the logfile
180 is clean. Instead if the logfile is closed,
181 we know it must be clean. If it is open and
182 this bit is set, we also know it must be
183 clean. If on the other hand the logfile is
184 open and this bit is clear, we can be almost
185 certain that the logfile is dirty. */
186/* 16*/ le32 seq_number_bits; /* How many bits to use for the sequence
187 number. This is calculated as 67 - the
188 number of bits required to store the logfile
189 size in bytes and this can be used together with
190 the specified file_size as a consistency
191 check. */
192/* 20*/ le16 restart_area_length;/* Length of the restart area including the
193 client array. Following checks required if
194 version matches. Otherwise, skip them.
195 restart_area_offset + restart_area_length
196 has to be <= system_page_size. Also,
197 restart_area_length has to be >=
198 client_array_offset + (log_clients *
199 sizeof(log client record)). */
200/* 22*/ le16 client_array_offset;/* Offset from the start of this record to
201 the first log client record if versions are
202 matched. When creating, set this to be
203 after this restart area structure, aligned
204 to 8-bytes boundary. If the versions do not
205 match, this is ignored and the offset is
206 assumed to be (sizeof(RESTART_AREA) + 7) &
207 ~7, i.e. rounded up to first 8-byte
208 boundary. Either way, client_array_offset
209 has to be aligned to an 8-byte boundary.
210 Also, restart_area_offset +
211 client_array_offset has to be <= 510.
212 Finally, client_array_offset + (log_clients
213 * sizeof(log client record)) has to be <=
214 system_page_size. On Win2k and presumably
215 earlier, this is 0x30, i.e. immediately
216 following this record. On WinXP and
217 presumably later, this is 0x40, i.e. there
218 are 16 extra bytes between this record and
219 the client array. This probably means that
220 the RESTART_AREA record is actually bigger
221 in WinXP and later. */
222/* 24*/ sle64 file_size; /* Usable byte size of the log file. If the
223 restart_area_offset + the offset of the
224 file_size are > 510 then corruption has
225 occurred. This is the very first check when
226 starting with the restart_area as if it
227 fails it means that some of the above values
228 will be corrupted by the multi sector
229 transfer protection. The file_size has to
230 be rounded down to be a multiple of the
231 log_page_size in the RESTART_PAGE_HEADER and
232 then it has to be at least big enough to
233 store the two restart pages and 48 (0x30)
234 log record pages. */
235/* 32*/ le32 last_lsn_data_length;/* Length of data of last LSN, not including
236 the log record header. On create set to
237 0. */
238/* 36*/ le16 log_record_header_length;/* Byte size of the log record header.
239 If the version matches then check that the
240 value of log_record_header_length is a
241 multiple of 8, i.e.
242 (log_record_header_length + 7) & ~7 ==
243 log_record_header_length. When creating set
244 it to sizeof(LOG_RECORD_HEADER), aligned to
245 8 bytes. */
246/* 38*/ le16 log_page_data_offset;/* Offset to the start of data in a log record
247 page. Must be a multiple of 8. On create
248 set it to immediately after the update
249 sequence array of the log record page. */
250/* 40*/ le32 restart_log_open_count;/* A counter that gets incremented every
251 time the logfile is restarted which happens
252 at mount time when the logfile is opened.
253 When creating set to a random value. Win2k
254 sets it to the low 32 bits of the current
255 system time in NTFS format (see time.h). */
256/* 44*/ le32 reserved; /* Reserved/alignment to 8-byte boundary. */
257/* sizeof() = 48 (0x30) bytes */
258} __attribute__ ((__packed__)) RESTART_AREA;
259
260/*
261 * Log client record. The offset of this record is found by adding the offset
262 * of the RESTART_AREA to the client_array_offset value found in it.
263 */
264typedef struct {
265/*Ofs*/
266/* 0*/ leLSN oldest_lsn; /* Oldest LSN needed by this client. On create
267 set to 0. */
268/* 8*/ leLSN client_restart_lsn;/* LSN at which this client needs to restart
269 the volume, i.e. the current position within
270 the log file. At present, if clean this
271 should = current_lsn in restart area but it
272 probably also = current_lsn when dirty most
273 of the time. At create set to 0. */
274/* 16*/ le16 prev_client; /* The offset to the previous log client record
275 in the array of log client records.
276 LOGFILE_NO_CLIENT means there is no previous
277 client record, i.e. this is the first one.
278 This is always LOGFILE_NO_CLIENT. */
279/* 18*/ le16 next_client; /* The offset to the next log client record in
280 the array of log client records.
281 LOGFILE_NO_CLIENT means there is no next
282 client record, i.e. this is the last one.
283 This is always LOGFILE_NO_CLIENT. */
284/* 20*/ le16 seq_number; /* On Win2k and presumably earlier, this is set
285 to zero every time the logfile is restarted
286 and it is incremented when the logfile is
287 closed at dismount time. Thus it is 0 when
288 dirty and 1 when clean. On WinXP and
289 presumably later, this is always 0. */
290/* 22*/ u8 reserved[6]; /* Reserved/alignment. */
291/* 28*/ le32 client_name_length;/* Length of client name in bytes. Should
292 always be 8. */
293/* 32*/ ntfschar client_name[64];/* Name of the client in Unicode. Should
294 always be "NTFS" with the remaining bytes
295 set to 0. */
296/* sizeof() = 160 (0xa0) bytes */
297} __attribute__ ((__packed__)) LOG_CLIENT_RECORD;
298
299extern BOOL ntfs_check_logfile(struct inode *log_vi);
300
301extern BOOL ntfs_is_logfile_clean(struct inode *log_vi);
302
303extern BOOL ntfs_empty_logfile(struct inode *log_vi);
304
305#endif /* NTFS_RW */
306
307#endif /* _LINUX_NTFS_LOGFILE_H */
diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h
new file mode 100644
index 000000000000..fac5944df6d8
--- /dev/null
+++ b/fs/ntfs/malloc.h
@@ -0,0 +1,62 @@
1/*
2 * malloc.h - NTFS kernel memory handling. Part of the Linux-NTFS project.
3 *
4 * Copyright (c) 2001-2004 Anton Altaparmakov
5 *
6 * This program/include file is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as published
8 * by the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program/include file is distributed in the hope that it will be
12 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
13 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program (in the main directory of the Linux-NTFS
18 * distribution in the file COPYING); if not, write to the Free Software
19 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21
22#ifndef _LINUX_NTFS_MALLOC_H
23#define _LINUX_NTFS_MALLOC_H
24
25#include <linux/vmalloc.h>
26#include <linux/slab.h>
27#include <linux/highmem.h>
28
29/**
30 * ntfs_malloc_nofs - allocate memory in multiples of pages
31 * @size number of bytes to allocate
32 *
33 * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and
34 * returns a pointer to the allocated memory.
35 *
36 * If there was insufficient memory to complete the request, return NULL.
37 */
38static inline void *ntfs_malloc_nofs(unsigned long size)
39{
40 if (likely(size <= PAGE_SIZE)) {
41 BUG_ON(!size);
42 /* kmalloc() has per-CPU caches so is faster for now. */
43 return kmalloc(PAGE_SIZE, GFP_NOFS);
44 /* return (void *)__get_free_page(GFP_NOFS | __GFP_HIGHMEM); */
45 }
46 if (likely(size >> PAGE_SHIFT < num_physpages))
47 return __vmalloc(size, GFP_NOFS | __GFP_HIGHMEM, PAGE_KERNEL);
48 return NULL;
49}
50
51static inline void ntfs_free(void *addr)
52{
53 if (likely(((unsigned long)addr < VMALLOC_START) ||
54 ((unsigned long)addr >= VMALLOC_END ))) {
55 kfree(addr);
56 /* free_page((unsigned long)addr); */
57 return;
58 }
59 vfree(addr);
60}
61
62#endif /* _LINUX_NTFS_MALLOC_H */
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
new file mode 100644
index 000000000000..dfa85ac2f8ba
--- /dev/null
+++ b/fs/ntfs/mft.c
@@ -0,0 +1,2829 @@
1/**
2 * mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project.
3 *
4 * Copyright (c) 2001-2004 Anton Altaparmakov
5 * Copyright (c) 2002 Richard Russon
6 *
7 * This program/include file is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as published
9 * by the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program/include file is distributed in the hope that it will be
13 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program (in the main directory of the Linux-NTFS
19 * distribution in the file COPYING); if not, write to the Free Software
20 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 */
22
23#include <linux/buffer_head.h>
24#include <linux/swap.h>
25
26#include "attrib.h"
27#include "aops.h"
28#include "bitmap.h"
29#include "debug.h"
30#include "dir.h"
31#include "lcnalloc.h"
32#include "malloc.h"
33#include "mft.h"
34#include "ntfs.h"
35
36/**
37 * map_mft_record_page - map the page in which a specific mft record resides
38 * @ni: ntfs inode whose mft record page to map
39 *
40 * This maps the page in which the mft record of the ntfs inode @ni is situated
41 * and returns a pointer to the mft record within the mapped page.
42 *
43 * Return value needs to be checked with IS_ERR() and if that is true PTR_ERR()
44 * contains the negative error code returned.
45 */
46static inline MFT_RECORD *map_mft_record_page(ntfs_inode *ni)
47{
48 ntfs_volume *vol = ni->vol;
49 struct inode *mft_vi = vol->mft_ino;
50 struct page *page;
51 unsigned long index, ofs, end_index;
52
53 BUG_ON(ni->page);
54 /*
55 * The index into the page cache and the offset within the page cache
56 * page of the wanted mft record. FIXME: We need to check for
57 * overflowing the unsigned long, but I don't think we would ever get
58 * here if the volume was that big...
59 */
60 index = ni->mft_no << vol->mft_record_size_bits >> PAGE_CACHE_SHIFT;
61 ofs = (ni->mft_no << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK;
62
63 /* The maximum valid index into the page cache for $MFT's data. */
64 end_index = mft_vi->i_size >> PAGE_CACHE_SHIFT;
65
66 /* If the wanted index is out of bounds the mft record doesn't exist. */
67 if (unlikely(index >= end_index)) {
68 if (index > end_index || (mft_vi->i_size & ~PAGE_CACHE_MASK) <
69 ofs + vol->mft_record_size) {
70 page = ERR_PTR(-ENOENT);
71 ntfs_error(vol->sb, "Attemt to read mft record 0x%lx, "
72 "which is beyond the end of the mft. "
73 "This is probably a bug in the ntfs "
74 "driver.", ni->mft_no);
75 goto err_out;
76 }
77 }
78 /* Read, map, and pin the page. */
79 page = ntfs_map_page(mft_vi->i_mapping, index);
80 if (likely(!IS_ERR(page))) {
81 /* Catch multi sector transfer fixup errors. */
82 if (likely(ntfs_is_mft_recordp((le32*)(page_address(page) +
83 ofs)))) {
84 ni->page = page;
85 ni->page_ofs = ofs;
86 return page_address(page) + ofs;
87 }
88 ntfs_error(vol->sb, "Mft record 0x%lx is corrupt. "
89 "Run chkdsk.", ni->mft_no);
90 ntfs_unmap_page(page);
91 page = ERR_PTR(-EIO);
92 }
93err_out:
94 ni->page = NULL;
95 ni->page_ofs = 0;
96 return (void*)page;
97}
98
99/**
100 * map_mft_record - map, pin and lock an mft record
101 * @ni: ntfs inode whose MFT record to map
102 *
103 * First, take the mrec_lock semaphore. We might now be sleeping, while waiting
104 * for the semaphore if it was already locked by someone else.
105 *
106 * The page of the record is mapped using map_mft_record_page() before being
107 * returned to the caller.
108 *
109 * This in turn uses ntfs_map_page() to get the page containing the wanted mft
110 * record (it in turn calls read_cache_page() which reads it in from disk if
111 * necessary, increments the use count on the page so that it cannot disappear
112 * under us and returns a reference to the page cache page).
113 *
114 * If read_cache_page() invokes ntfs_readpage() to load the page from disk, it
115 * sets PG_locked and clears PG_uptodate on the page. Once I/O has completed
116 * and the post-read mst fixups on each mft record in the page have been
117 * performed, the page gets PG_uptodate set and PG_locked cleared (this is done
118 * in our asynchronous I/O completion handler end_buffer_read_mft_async()).
119 * ntfs_map_page() waits for PG_locked to become clear and checks if
120 * PG_uptodate is set and returns an error code if not. This provides
121 * sufficient protection against races when reading/using the page.
122 *
123 * However there is the write mapping to think about. Doing the above described
124 * checking here will be fine, because when initiating the write we will set
125 * PG_locked and clear PG_uptodate making sure nobody is touching the page
126 * contents. Doing the locking this way means that the commit to disk code in
127 * the page cache code paths is automatically sufficiently locked with us as
128 * we will not touch a page that has been locked or is not uptodate. The only
129 * locking problem then is them locking the page while we are accessing it.
130 *
131 * So that code will end up having to own the mrec_lock of all mft
132 * records/inodes present in the page before I/O can proceed. In that case we
133 * wouldn't need to bother with PG_locked and PG_uptodate as nobody will be
134 * accessing anything without owning the mrec_lock semaphore. But we do need
135 * to use them because of the read_cache_page() invocation and the code becomes
136 * so much simpler this way that it is well worth it.
137 *
138 * The mft record is now ours and we return a pointer to it. You need to check
139 * the returned pointer with IS_ERR() and if that is true, PTR_ERR() will return
140 * the error code.
141 *
142 * NOTE: Caller is responsible for setting the mft record dirty before calling
143 * unmap_mft_record(). This is obviously only necessary if the caller really
144 * modified the mft record...
145 * Q: Do we want to recycle one of the VFS inode state bits instead?
146 * A: No, the inode ones mean we want to change the mft record, not we want to
147 * write it out.
148 */
149MFT_RECORD *map_mft_record(ntfs_inode *ni)
150{
151 MFT_RECORD *m;
152
153 ntfs_debug("Entering for mft_no 0x%lx.", ni->mft_no);
154
155 /* Make sure the ntfs inode doesn't go away. */
156 atomic_inc(&ni->count);
157
158 /* Serialize access to this mft record. */
159 down(&ni->mrec_lock);
160
161 m = map_mft_record_page(ni);
162 if (likely(!IS_ERR(m)))
163 return m;
164
165 up(&ni->mrec_lock);
166 atomic_dec(&ni->count);
167 ntfs_error(ni->vol->sb, "Failed with error code %lu.", -PTR_ERR(m));
168 return m;
169}
170
171/**
172 * unmap_mft_record_page - unmap the page in which a specific mft record resides
173 * @ni: ntfs inode whose mft record page to unmap
174 *
175 * This unmaps the page in which the mft record of the ntfs inode @ni is
176 * situated and returns. This is a NOOP if highmem is not configured.
177 *
178 * The unmap happens via ntfs_unmap_page() which in turn decrements the use
179 * count on the page thus releasing it from the pinned state.
180 *
181 * We do not actually unmap the page from memory of course, as that will be
182 * done by the page cache code itself when memory pressure increases or
183 * whatever.
184 */
185static inline void unmap_mft_record_page(ntfs_inode *ni)
186{
187 BUG_ON(!ni->page);
188
189 // TODO: If dirty, blah...
190 ntfs_unmap_page(ni->page);
191 ni->page = NULL;
192 ni->page_ofs = 0;
193 return;
194}
195
196/**
197 * unmap_mft_record - release a mapped mft record
198 * @ni: ntfs inode whose MFT record to unmap
199 *
200 * We release the page mapping and the mrec_lock mutex which unmaps the mft
201 * record and releases it for others to get hold of. We also release the ntfs
202 * inode by decrementing the ntfs inode reference count.
203 *
204 * NOTE: If caller has modified the mft record, it is imperative to set the mft
205 * record dirty BEFORE calling unmap_mft_record().
206 */
207void unmap_mft_record(ntfs_inode *ni)
208{
209 struct page *page = ni->page;
210
211 BUG_ON(!page);
212
213 ntfs_debug("Entering for mft_no 0x%lx.", ni->mft_no);
214
215 unmap_mft_record_page(ni);
216 up(&ni->mrec_lock);
217 atomic_dec(&ni->count);
218 /*
219 * If pure ntfs_inode, i.e. no vfs inode attached, we leave it to
220 * ntfs_clear_extent_inode() in the extent inode case, and to the
221 * caller in the non-extent, yet pure ntfs inode case, to do the actual
222 * tear down of all structures and freeing of all allocated memory.
223 */
224 return;
225}
226
227/**
228 * map_extent_mft_record - load an extent inode and attach it to its base
229 * @base_ni: base ntfs inode
230 * @mref: mft reference of the extent inode to load
231 * @ntfs_ino: on successful return, pointer to the ntfs_inode structure
232 *
233 * Load the extent mft record @mref and attach it to its base inode @base_ni.
234 * Return the mapped extent mft record if IS_ERR(result) is false. Otherwise
235 * PTR_ERR(result) gives the negative error code.
236 *
237 * On successful return, @ntfs_ino contains a pointer to the ntfs_inode
238 * structure of the mapped extent inode.
239 */
240MFT_RECORD *map_extent_mft_record(ntfs_inode *base_ni, MFT_REF mref,
241 ntfs_inode **ntfs_ino)
242{
243 MFT_RECORD *m;
244 ntfs_inode *ni = NULL;
245 ntfs_inode **extent_nis = NULL;
246 int i;
247 unsigned long mft_no = MREF(mref);
248 u16 seq_no = MSEQNO(mref);
249 BOOL destroy_ni = FALSE;
250
251 ntfs_debug("Mapping extent mft record 0x%lx (base mft record 0x%lx).",
252 mft_no, base_ni->mft_no);
253 /* Make sure the base ntfs inode doesn't go away. */
254 atomic_inc(&base_ni->count);
255 /*
256 * Check if this extent inode has already been added to the base inode,
257 * in which case just return it. If not found, add it to the base
258 * inode before returning it.
259 */
260 down(&base_ni->extent_lock);
261 if (base_ni->nr_extents > 0) {
262 extent_nis = base_ni->ext.extent_ntfs_inos;
263 for (i = 0; i < base_ni->nr_extents; i++) {
264 if (mft_no != extent_nis[i]->mft_no)
265 continue;
266 ni = extent_nis[i];
267 /* Make sure the ntfs inode doesn't go away. */
268 atomic_inc(&ni->count);
269 break;
270 }
271 }
272 if (likely(ni != NULL)) {
273 up(&base_ni->extent_lock);
274 atomic_dec(&base_ni->count);
275 /* We found the record; just have to map and return it. */
276 m = map_mft_record(ni);
277 /* map_mft_record() has incremented this on success. */
278 atomic_dec(&ni->count);
279 if (likely(!IS_ERR(m))) {
280 /* Verify the sequence number. */
281 if (likely(le16_to_cpu(m->sequence_number) == seq_no)) {
282 ntfs_debug("Done 1.");
283 *ntfs_ino = ni;
284 return m;
285 }
286 unmap_mft_record(ni);
287 ntfs_error(base_ni->vol->sb, "Found stale extent mft "
288 "reference! Corrupt file system. "
289 "Run chkdsk.");
290 return ERR_PTR(-EIO);
291 }
292map_err_out:
293 ntfs_error(base_ni->vol->sb, "Failed to map extent "
294 "mft record, error code %ld.", -PTR_ERR(m));
295 return m;
296 }
297 /* Record wasn't there. Get a new ntfs inode and initialize it. */
298 ni = ntfs_new_extent_inode(base_ni->vol->sb, mft_no);
299 if (unlikely(!ni)) {
300 up(&base_ni->extent_lock);
301 atomic_dec(&base_ni->count);
302 return ERR_PTR(-ENOMEM);
303 }
304 ni->vol = base_ni->vol;
305 ni->seq_no = seq_no;
306 ni->nr_extents = -1;
307 ni->ext.base_ntfs_ino = base_ni;
308 /* Now map the record. */
309 m = map_mft_record(ni);
310 if (IS_ERR(m)) {
311 up(&base_ni->extent_lock);
312 atomic_dec(&base_ni->count);
313 ntfs_clear_extent_inode(ni);
314 goto map_err_out;
315 }
316 /* Verify the sequence number if it is present. */
317 if (seq_no && (le16_to_cpu(m->sequence_number) != seq_no)) {
318 ntfs_error(base_ni->vol->sb, "Found stale extent mft "
319 "reference! Corrupt file system. Run chkdsk.");
320 destroy_ni = TRUE;
321 m = ERR_PTR(-EIO);
322 goto unm_err_out;
323 }
324 /* Attach extent inode to base inode, reallocating memory if needed. */
325 if (!(base_ni->nr_extents & 3)) {
326 ntfs_inode **tmp;
327 int new_size = (base_ni->nr_extents + 4) * sizeof(ntfs_inode *);
328
329 tmp = (ntfs_inode **)kmalloc(new_size, GFP_NOFS);
330 if (unlikely(!tmp)) {
331 ntfs_error(base_ni->vol->sb, "Failed to allocate "
332 "internal buffer.");
333 destroy_ni = TRUE;
334 m = ERR_PTR(-ENOMEM);
335 goto unm_err_out;
336 }
337 if (base_ni->nr_extents) {
338 BUG_ON(!base_ni->ext.extent_ntfs_inos);
339 memcpy(tmp, base_ni->ext.extent_ntfs_inos, new_size -
340 4 * sizeof(ntfs_inode *));
341 kfree(base_ni->ext.extent_ntfs_inos);
342 }
343 base_ni->ext.extent_ntfs_inos = tmp;
344 }
345 base_ni->ext.extent_ntfs_inos[base_ni->nr_extents++] = ni;
346 up(&base_ni->extent_lock);
347 atomic_dec(&base_ni->count);
348 ntfs_debug("Done 2.");
349 *ntfs_ino = ni;
350 return m;
351unm_err_out:
352 unmap_mft_record(ni);
353 up(&base_ni->extent_lock);
354 atomic_dec(&base_ni->count);
355 /*
356 * If the extent inode was not attached to the base inode we need to
357 * release it or we will leak memory.
358 */
359 if (destroy_ni)
360 ntfs_clear_extent_inode(ni);
361 return m;
362}
363
364#ifdef NTFS_RW
365
366/**
367 * __mark_mft_record_dirty - set the mft record and the page containing it dirty
368 * @ni: ntfs inode describing the mapped mft record
369 *
370 * Internal function. Users should call mark_mft_record_dirty() instead.
371 *
372 * Set the mapped (extent) mft record of the (base or extent) ntfs inode @ni,
373 * as well as the page containing the mft record, dirty. Also, mark the base
374 * vfs inode dirty. This ensures that any changes to the mft record are
375 * written out to disk.
376 *
377 * NOTE: We only set I_DIRTY_SYNC and I_DIRTY_DATASYNC (and not I_DIRTY_PAGES)
378 * on the base vfs inode, because even though file data may have been modified,
379 * it is dirty in the inode meta data rather than the data page cache of the
380 * inode, and thus there are no data pages that need writing out. Therefore, a
381 * full mark_inode_dirty() is overkill. A mark_inode_dirty_sync(), on the
382 * other hand, is not sufficient, because I_DIRTY_DATASYNC needs to be set to
383 * ensure ->write_inode is called from generic_osync_inode() and this needs to
384 * happen or the file data would not necessarily hit the device synchronously,
385 * even though the vfs inode has the O_SYNC flag set. Also, I_DIRTY_DATASYNC
386 * simply "feels" better than just I_DIRTY_SYNC, since the file data has not
387 * actually hit the block device yet, which is not what I_DIRTY_SYNC on its own
388 * would suggest.
389 */
390void __mark_mft_record_dirty(ntfs_inode *ni)
391{
392 ntfs_inode *base_ni;
393
394 ntfs_debug("Entering for inode 0x%lx.", ni->mft_no);
395 BUG_ON(NInoAttr(ni));
396 mark_ntfs_record_dirty(ni->page, ni->page_ofs);
397 /* Determine the base vfs inode and mark it dirty, too. */
398 down(&ni->extent_lock);
399 if (likely(ni->nr_extents >= 0))
400 base_ni = ni;
401 else
402 base_ni = ni->ext.base_ntfs_ino;
403 up(&ni->extent_lock);
404 __mark_inode_dirty(VFS_I(base_ni), I_DIRTY_SYNC | I_DIRTY_DATASYNC);
405}
406
/* Request for feedback appended to messages reporting unimplemented paths. */
static const char *ntfs_please_email =
		"Please email linux-ntfs-dev@lists.sourceforge.net "
		"and say that you saw this message. Thank you.";
410
411/**
412 * ntfs_sync_mft_mirror_umount - synchronise an mft record to the mft mirror
413 * @vol: ntfs volume on which the mft record to synchronize resides
414 * @mft_no: mft record number of mft record to synchronize
415 * @m: mapped, mst protected (extent) mft record to synchronize
416 *
417 * Write the mapped, mst protected (extent) mft record @m with mft record
418 * number @mft_no to the mft mirror ($MFTMirr) of the ntfs volume @vol,
419 * bypassing the page cache and the $MFTMirr inode itself.
420 *
421 * This function is only for use at umount time when the mft mirror inode has
422 * already been disposed off. We BUG() if we are called while the mft mirror
423 * inode is still attached to the volume.
424 *
425 * On success return 0. On error return -errno.
426 *
427 * NOTE: This function is not implemented yet as I am not convinced it can
428 * actually be triggered considering the sequence of commits we do in super.c::
429 * ntfs_put_super(). But just in case we provide this place holder as the
430 * alternative would be either to BUG() or to get a NULL pointer dereference
431 * and Oops.
432 */
433static int ntfs_sync_mft_mirror_umount(ntfs_volume *vol,
434 const unsigned long mft_no, MFT_RECORD *m)
435{
436 BUG_ON(vol->mftmirr_ino);
437 ntfs_error(vol->sb, "Umount time mft mirror syncing is not "
438 "implemented yet. %s", ntfs_please_email);
439 return -EOPNOTSUPP;
440}
441
442/**
443 * ntfs_sync_mft_mirror - synchronize an mft record to the mft mirror
444 * @vol: ntfs volume on which the mft record to synchronize resides
445 * @mft_no: mft record number of mft record to synchronize
446 * @m: mapped, mst protected (extent) mft record to synchronize
447 * @sync: if true, wait for i/o completion
448 *
449 * Write the mapped, mst protected (extent) mft record @m with mft record
450 * number @mft_no to the mft mirror ($MFTMirr) of the ntfs volume @vol.
451 *
452 * On success return 0. On error return -errno and set the volume errors flag
453 * in the ntfs volume @vol.
454 *
455 * NOTE: We always perform synchronous i/o and ignore the @sync parameter.
456 *
457 * TODO: If @sync is false, want to do truly asynchronous i/o, i.e. just
458 * schedule i/o via ->writepage or do it via kntfsd or whatever.
459 */
460int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no,
461 MFT_RECORD *m, int sync)
462{
463 struct page *page;
464 unsigned int blocksize = vol->sb->s_blocksize;
465 int max_bhs = vol->mft_record_size / blocksize;
466 struct buffer_head *bhs[max_bhs];
467 struct buffer_head *bh, *head;
468 u8 *kmirr;
469 runlist_element *rl;
470 unsigned int block_start, block_end, m_start, m_end, page_ofs;
471 int i_bhs, nr_bhs, err = 0;
472 unsigned char blocksize_bits = vol->mftmirr_ino->i_blkbits;
473
474 ntfs_debug("Entering for inode 0x%lx.", mft_no);
475 BUG_ON(!max_bhs);
476 if (unlikely(!vol->mftmirr_ino)) {
477 /* This could happen during umount... */
478 err = ntfs_sync_mft_mirror_umount(vol, mft_no, m);
479 if (likely(!err))
480 return err;
481 goto err_out;
482 }
483 /* Get the page containing the mirror copy of the mft record @m. */
484 page = ntfs_map_page(vol->mftmirr_ino->i_mapping, mft_no >>
485 (PAGE_CACHE_SHIFT - vol->mft_record_size_bits));
486 if (IS_ERR(page)) {
487 ntfs_error(vol->sb, "Failed to map mft mirror page.");
488 err = PTR_ERR(page);
489 goto err_out;
490 }
491 lock_page(page);
492 BUG_ON(!PageUptodate(page));
493 ClearPageUptodate(page);
494 /* Offset of the mft mirror record inside the page. */
495 page_ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK;
496 /* The address in the page of the mirror copy of the mft record @m. */
497 kmirr = page_address(page) + page_ofs;
498 /* Copy the mst protected mft record to the mirror. */
499 memcpy(kmirr, m, vol->mft_record_size);
500 /* Create uptodate buffers if not present. */
501 if (unlikely(!page_has_buffers(page))) {
502 struct buffer_head *tail;
503
504 bh = head = alloc_page_buffers(page, blocksize, 1);
505 do {
506 set_buffer_uptodate(bh);
507 tail = bh;
508 bh = bh->b_this_page;
509 } while (bh);
510 tail->b_this_page = head;
511 attach_page_buffers(page, head);
512 BUG_ON(!page_has_buffers(page));
513 }
514 bh = head = page_buffers(page);
515 BUG_ON(!bh);
516 rl = NULL;
517 nr_bhs = 0;
518 block_start = 0;
519 m_start = kmirr - (u8*)page_address(page);
520 m_end = m_start + vol->mft_record_size;
521 do {
522 block_end = block_start + blocksize;
523 /* If the buffer is outside the mft record, skip it. */
524 if (block_end <= m_start)
525 continue;
526 if (unlikely(block_start >= m_end))
527 break;
528 /* Need to map the buffer if it is not mapped already. */
529 if (unlikely(!buffer_mapped(bh))) {
530 VCN vcn;
531 LCN lcn;
532 unsigned int vcn_ofs;
533
534 /* Obtain the vcn and offset of the current block. */
535 vcn = ((VCN)mft_no << vol->mft_record_size_bits) +
536 (block_start - m_start);
537 vcn_ofs = vcn & vol->cluster_size_mask;
538 vcn >>= vol->cluster_size_bits;
539 if (!rl) {
540 down_read(&NTFS_I(vol->mftmirr_ino)->
541 runlist.lock);
542 rl = NTFS_I(vol->mftmirr_ino)->runlist.rl;
543 /*
544 * $MFTMirr always has the whole of its runlist
545 * in memory.
546 */
547 BUG_ON(!rl);
548 }
549 /* Seek to element containing target vcn. */
550 while (rl->length && rl[1].vcn <= vcn)
551 rl++;
552 lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
553 /* For $MFTMirr, only lcn >= 0 is a successful remap. */
554 if (likely(lcn >= 0)) {
555 /* Setup buffer head to correct block. */
556 bh->b_blocknr = ((lcn <<
557 vol->cluster_size_bits) +
558 vcn_ofs) >> blocksize_bits;
559 set_buffer_mapped(bh);
560 } else {
561 bh->b_blocknr = -1;
562 ntfs_error(vol->sb, "Cannot write mft mirror "
563 "record 0x%lx because its "
564 "location on disk could not "
565 "be determined (error code "
566 "%lli).", mft_no,
567 (long long)lcn);
568 err = -EIO;
569 }
570 }
571 BUG_ON(!buffer_uptodate(bh));
572 BUG_ON(!nr_bhs && (m_start != block_start));
573 BUG_ON(nr_bhs >= max_bhs);
574 bhs[nr_bhs++] = bh;
575 BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end));
576 } while (block_start = block_end, (bh = bh->b_this_page) != head);
577 if (unlikely(rl))
578 up_read(&NTFS_I(vol->mftmirr_ino)->runlist.lock);
579 if (likely(!err)) {
580 /* Lock buffers and start synchronous write i/o on them. */
581 for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
582 struct buffer_head *tbh = bhs[i_bhs];
583
584 if (unlikely(test_set_buffer_locked(tbh)))
585 BUG();
586 BUG_ON(!buffer_uptodate(tbh));
587 clear_buffer_dirty(tbh);
588 get_bh(tbh);
589 tbh->b_end_io = end_buffer_write_sync;
590 submit_bh(WRITE, tbh);
591 }
592 /* Wait on i/o completion of buffers. */
593 for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
594 struct buffer_head *tbh = bhs[i_bhs];
595
596 wait_on_buffer(tbh);
597 if (unlikely(!buffer_uptodate(tbh))) {
598 err = -EIO;
599 /*
600 * Set the buffer uptodate so the page and
601 * buffer states do not become out of sync.
602 */
603 set_buffer_uptodate(tbh);
604 }
605 }
606 } else /* if (unlikely(err)) */ {
607 /* Clean the buffers. */
608 for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++)
609 clear_buffer_dirty(bhs[i_bhs]);
610 }
611 /* Current state: all buffers are clean, unlocked, and uptodate. */
612 /* Remove the mst protection fixups again. */
613 post_write_mst_fixup((NTFS_RECORD*)kmirr);
614 flush_dcache_page(page);
615 SetPageUptodate(page);
616 unlock_page(page);
617 ntfs_unmap_page(page);
618 if (likely(!err)) {
619 ntfs_debug("Done.");
620 } else {
621 ntfs_error(vol->sb, "I/O error while writing mft mirror "
622 "record 0x%lx!", mft_no);
623err_out:
624 ntfs_error(vol->sb, "Failed to synchronize $MFTMirr (error "
625 "code %i). Volume will be left marked dirty "
626 "on umount. Run ntfsfix on the partition "
627 "after umounting to correct this.", -err);
628 NVolSetErrors(vol);
629 }
630 return err;
631}
632
633/**
634 * write_mft_record_nolock - write out a mapped (extent) mft record
635 * @ni: ntfs inode describing the mapped (extent) mft record
636 * @m: mapped (extent) mft record to write
637 * @sync: if true, wait for i/o completion
638 *
639 * Write the mapped (extent) mft record @m described by the (regular or extent)
640 * ntfs inode @ni to backing store. If the mft record @m has a counterpart in
641 * the mft mirror, that is also updated.
642 *
643 * We only write the mft record if the ntfs inode @ni is dirty and the first
644 * buffer belonging to its mft record is dirty, too. We ignore the dirty state
645 * of subsequent buffers because we could have raced with
646 * fs/ntfs/aops.c::mark_ntfs_record_dirty().
647 *
648 * On success, clean the mft record and return 0. On error, leave the mft
649 * record dirty and return -errno. The caller should call make_bad_inode() on
650 * the base inode to ensure no more access happens to this inode. We do not do
651 * it here as the caller may want to finish writing other extent mft records
652 * first to minimize on-disk metadata inconsistencies.
653 *
654 * NOTE: We always perform synchronous i/o and ignore the @sync parameter.
655 * However, if the mft record has a counterpart in the mft mirror and @sync is
656 * true, we write the mft record, wait for i/o completion, and only then write
657 * the mft mirror copy. This ensures that if the system crashes either the mft
658 * or the mft mirror will contain a self-consistent mft record @m. If @sync is
659 * false on the other hand, we start i/o on both and then wait for completion
660 * on them. This provides a speedup but no longer guarantees that you will end
661 * up with a self-consistent mft record in the case of a crash but if you asked
662 * for asynchronous writing you probably do not care about that anyway.
663 *
664 * TODO: If @sync is false, want to do truly asynchronous i/o, i.e. just
665 * schedule i/o via ->writepage or do it via kntfsd or whatever.
666 */
667int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync)
668{
669 ntfs_volume *vol = ni->vol;
670 struct page *page = ni->page;
671 unsigned char blocksize_bits = vol->mft_ino->i_blkbits;
672 unsigned int blocksize = 1 << blocksize_bits;
673 int max_bhs = vol->mft_record_size / blocksize;
674 struct buffer_head *bhs[max_bhs];
675 struct buffer_head *bh, *head;
676 runlist_element *rl;
677 unsigned int block_start, block_end, m_start, m_end;
678 int i_bhs, nr_bhs, err = 0;
679
680 ntfs_debug("Entering for inode 0x%lx.", ni->mft_no);
681 BUG_ON(NInoAttr(ni));
682 BUG_ON(!max_bhs);
683 BUG_ON(!PageLocked(page));
684 /*
685 * If the ntfs_inode is clean no need to do anything. If it is dirty,
686 * mark it as clean now so that it can be redirtied later on if needed.
687 * There is no danger of races since the caller is holding the locks
688 * for the mft record @m and the page it is in.
689 */
690 if (!NInoTestClearDirty(ni))
691 goto done;
692 BUG_ON(!page_has_buffers(page));
693 bh = head = page_buffers(page);
694 BUG_ON(!bh);
695 rl = NULL;
696 nr_bhs = 0;
697 block_start = 0;
698 m_start = ni->page_ofs;
699 m_end = m_start + vol->mft_record_size;
700 do {
701 block_end = block_start + blocksize;
702 /* If the buffer is outside the mft record, skip it. */
703 if (block_end <= m_start)
704 continue;
705 if (unlikely(block_start >= m_end))
706 break;
707 /*
708 * If this block is not the first one in the record, we ignore
709 * the buffer's dirty state because we could have raced with a
710 * parallel mark_ntfs_record_dirty().
711 */
712 if (block_start == m_start) {
713 /* This block is the first one in the record. */
714 if (!buffer_dirty(bh)) {
715 BUG_ON(nr_bhs);
716 /* Clean records are not written out. */
717 break;
718 }
719 }
720 /* Need to map the buffer if it is not mapped already. */
721 if (unlikely(!buffer_mapped(bh))) {
722 VCN vcn;
723 LCN lcn;
724 unsigned int vcn_ofs;
725
726 /* Obtain the vcn and offset of the current block. */
727 vcn = ((VCN)ni->mft_no << vol->mft_record_size_bits) +
728 (block_start - m_start);
729 vcn_ofs = vcn & vol->cluster_size_mask;
730 vcn >>= vol->cluster_size_bits;
731 if (!rl) {
732 down_read(&NTFS_I(vol->mft_ino)->runlist.lock);
733 rl = NTFS_I(vol->mft_ino)->runlist.rl;
734 BUG_ON(!rl);
735 }
736 /* Seek to element containing target vcn. */
737 while (rl->length && rl[1].vcn <= vcn)
738 rl++;
739 lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
740 /* For $MFT, only lcn >= 0 is a successful remap. */
741 if (likely(lcn >= 0)) {
742 /* Setup buffer head to correct block. */
743 bh->b_blocknr = ((lcn <<
744 vol->cluster_size_bits) +
745 vcn_ofs) >> blocksize_bits;
746 set_buffer_mapped(bh);
747 } else {
748 bh->b_blocknr = -1;
749 ntfs_error(vol->sb, "Cannot write mft record "
750 "0x%lx because its location "
751 "on disk could not be "
752 "determined (error code %lli).",
753 ni->mft_no, (long long)lcn);
754 err = -EIO;
755 }
756 }
757 BUG_ON(!buffer_uptodate(bh));
758 BUG_ON(!nr_bhs && (m_start != block_start));
759 BUG_ON(nr_bhs >= max_bhs);
760 bhs[nr_bhs++] = bh;
761 BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end));
762 } while (block_start = block_end, (bh = bh->b_this_page) != head);
763 if (unlikely(rl))
764 up_read(&NTFS_I(vol->mft_ino)->runlist.lock);
765 if (!nr_bhs)
766 goto done;
767 if (unlikely(err))
768 goto cleanup_out;
769 /* Apply the mst protection fixups. */
770 err = pre_write_mst_fixup((NTFS_RECORD*)m, vol->mft_record_size);
771 if (err) {
772 ntfs_error(vol->sb, "Failed to apply mst fixups!");
773 goto cleanup_out;
774 }
775 flush_dcache_mft_record_page(ni);
776 /* Lock buffers and start synchronous write i/o on them. */
777 for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
778 struct buffer_head *tbh = bhs[i_bhs];
779
780 if (unlikely(test_set_buffer_locked(tbh)))
781 BUG();
782 BUG_ON(!buffer_uptodate(tbh));
783 clear_buffer_dirty(tbh);
784 get_bh(tbh);
785 tbh->b_end_io = end_buffer_write_sync;
786 submit_bh(WRITE, tbh);
787 }
788 /* Synchronize the mft mirror now if not @sync. */
789 if (!sync && ni->mft_no < vol->mftmirr_size)
790 ntfs_sync_mft_mirror(vol, ni->mft_no, m, sync);
791 /* Wait on i/o completion of buffers. */
792 for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
793 struct buffer_head *tbh = bhs[i_bhs];
794
795 wait_on_buffer(tbh);
796 if (unlikely(!buffer_uptodate(tbh))) {
797 err = -EIO;
798 /*
799 * Set the buffer uptodate so the page and buffer
800 * states do not become out of sync.
801 */
802 if (PageUptodate(page))
803 set_buffer_uptodate(tbh);
804 }
805 }
806 /* If @sync, now synchronize the mft mirror. */
807 if (sync && ni->mft_no < vol->mftmirr_size)
808 ntfs_sync_mft_mirror(vol, ni->mft_no, m, sync);
809 /* Remove the mst protection fixups again. */
810 post_write_mst_fixup((NTFS_RECORD*)m);
811 flush_dcache_mft_record_page(ni);
812 if (unlikely(err)) {
813 /* I/O error during writing. This is really bad! */
814 ntfs_error(vol->sb, "I/O error while writing mft record "
815 "0x%lx! Marking base inode as bad. You "
816 "should unmount the volume and run chkdsk.",
817 ni->mft_no);
818 goto err_out;
819 }
820done:
821 ntfs_debug("Done.");
822 return 0;
823cleanup_out:
824 /* Clean the buffers. */
825 for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++)
826 clear_buffer_dirty(bhs[i_bhs]);
827err_out:
828 /*
829 * Current state: all buffers are clean, unlocked, and uptodate.
830 * The caller should mark the base inode as bad so that no more i/o
831 * happens. ->clear_inode() will still be invoked so all extent inodes
832 * and other allocated memory will be freed.
833 */
834 if (err == -ENOMEM) {
835 ntfs_error(vol->sb, "Not enough memory to write mft record. "
836 "Redirtying so the write is retried later.");
837 mark_mft_record_dirty(ni);
838 err = 0;
839 } else
840 NVolSetErrors(vol);
841 return err;
842}
843
844/**
845 * ntfs_may_write_mft_record - check if an mft record may be written out
846 * @vol: [IN] ntfs volume on which the mft record to check resides
847 * @mft_no: [IN] mft record number of the mft record to check
848 * @m: [IN] mapped mft record to check
849 * @locked_ni: [OUT] caller has to unlock this ntfs inode if one is returned
850 *
851 * Check if the mapped (base or extent) mft record @m with mft record number
852 * @mft_no belonging to the ntfs volume @vol may be written out. If necessary
853 * and possible the ntfs inode of the mft record is locked and the base vfs
854 * inode is pinned. The locked ntfs inode is then returned in @locked_ni. The
855 * caller is responsible for unlocking the ntfs inode and unpinning the base
856 * vfs inode.
857 *
858 * Return TRUE if the mft record may be written out and FALSE if not.
859 *
860 * The caller has locked the page and cleared the uptodate flag on it which
861 * means that we can safely write out any dirty mft records that do not have
862 * their inodes in icache as determined by ilookup5() as anyone
863 * opening/creating such an inode would block when attempting to map the mft
864 * record in read_cache_page() until we are finished with the write out.
865 *
866 * Here is a description of the tests we perform:
867 *
868 * If the inode is found in icache we know the mft record must be a base mft
869 * record. If it is dirty, we do not write it and return FALSE as the vfs
870 * inode write paths will result in the access times being updated which would
871 * cause the base mft record to be redirtied and written out again. (We know
872 * the access time update will modify the base mft record because Windows
873 * chkdsk complains if the standard information attribute is not in the base
874 * mft record.)
875 *
876 * If the inode is in icache and not dirty, we attempt to lock the mft record
877 * and if we find the lock was already taken, it is not safe to write the mft
878 * record and we return FALSE.
879 *
880 * If we manage to obtain the lock we have exclusive access to the mft record,
881 * which also allows us safe writeout of the mft record. We then set
882 * @locked_ni to the locked ntfs inode and return TRUE.
883 *
884 * Note we cannot just lock the mft record and sleep while waiting for the lock
885 * because this would deadlock due to lock reversal (normally the mft record is
886 * locked before the page is locked but we already have the page locked here
887 * when we try to lock the mft record).
888 *
889 * If the inode is not in icache we need to perform further checks.
890 *
891 * If the mft record is not a FILE record or it is a base mft record, we can
892 * safely write it and return TRUE.
893 *
894 * We now know the mft record is an extent mft record. We check if the inode
895 * corresponding to its base mft record is in icache and obtain a reference to
896 * it if it is. If it is not, we can safely write it and return TRUE.
897 *
898 * We now have the base inode for the extent mft record. We check if it has an
899 * ntfs inode for the extent mft record attached and if not it is safe to write
900 * the extent mft record and we return TRUE.
901 *
902 * The ntfs inode for the extent mft record is attached to the base inode so we
903 * attempt to lock the extent mft record and if we find the lock was already
904 * taken, it is not safe to write the extent mft record and we return FALSE.
905 *
906 * If we manage to obtain the lock we have exclusive access to the extent mft
907 * record, which also allows us safe writeout of the extent mft record. We
908 * set the ntfs inode of the extent mft record clean and then set @locked_ni to
909 * the now locked ntfs inode and return TRUE.
910 *
911 * Note, the reason for actually writing dirty mft records here and not just
912 * relying on the vfs inode dirty code paths is that we can have mft records
913 * modified without them ever having actual inodes in memory. Also we can have
914 * dirty mft records with clean ntfs inodes in memory. None of the described
915 * cases would result in the dirty mft records being written out if we only
916 * relied on the vfs inode dirty code paths. And these cases can really occur
917 * during allocation of new mft records and in particular when the
918 * initialized_size of the $MFT/$DATA attribute is extended and the new space
919 * is initialized using ntfs_mft_record_format(). The clean inode can then
920 * appear if the mft record is reused for a new inode before it got written
921 * out.
922 */
923BOOL ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no,
924 const MFT_RECORD *m, ntfs_inode **locked_ni)
925{
926 struct super_block *sb = vol->sb;
927 struct inode *mft_vi = vol->mft_ino;
928 struct inode *vi;
929 ntfs_inode *ni, *eni, **extent_nis;
930 int i;
931 ntfs_attr na;
932
933 ntfs_debug("Entering for inode 0x%lx.", mft_no);
934 /*
935 * Normally we do not return a locked inode so set @locked_ni to NULL.
936 */
937 BUG_ON(!locked_ni);
938 *locked_ni = NULL;
939 /*
940 * Check if the inode corresponding to this mft record is in the VFS
941 * inode cache and obtain a reference to it if it is.
942 */
943 ntfs_debug("Looking for inode 0x%lx in icache.", mft_no);
944 na.mft_no = mft_no;
945 na.name = NULL;
946 na.name_len = 0;
947 na.type = AT_UNUSED;
948 /*
949 * For inode 0, i.e. $MFT itself, we cannot use ilookup5() from here or
950 * we deadlock because the inode is already locked by the kernel
951 * (fs/fs-writeback.c::__sync_single_inode()) and ilookup5() waits
952 * until the inode is unlocked before returning it and it never gets
953 * unlocked because ntfs_should_write_mft_record() never returns. )-:
954 * Fortunately, we have inode 0 pinned in icache for the duration of
955 * the mount so we can access it directly.
956 */
957 if (!mft_no) {
958 /* Balance the below iput(). */
959 vi = igrab(mft_vi);
960 BUG_ON(vi != mft_vi);
961 } else
962 vi = ilookup5(sb, mft_no, (test_t)ntfs_test_inode, &na);
963 if (vi) {
964 ntfs_debug("Base inode 0x%lx is in icache.", mft_no);
965 /* The inode is in icache. */
966 ni = NTFS_I(vi);
967 /* Take a reference to the ntfs inode. */
968 atomic_inc(&ni->count);
969 /* If the inode is dirty, do not write this record. */
970 if (NInoDirty(ni)) {
971 ntfs_debug("Inode 0x%lx is dirty, do not write it.",
972 mft_no);
973 atomic_dec(&ni->count);
974 iput(vi);
975 return FALSE;
976 }
977 ntfs_debug("Inode 0x%lx is not dirty.", mft_no);
978 /* The inode is not dirty, try to take the mft record lock. */
979 if (unlikely(down_trylock(&ni->mrec_lock))) {
980 ntfs_debug("Mft record 0x%lx is already locked, do "
981 "not write it.", mft_no);
982 atomic_dec(&ni->count);
983 iput(vi);
984 return FALSE;
985 }
986 ntfs_debug("Managed to lock mft record 0x%lx, write it.",
987 mft_no);
988 /*
989 * The write has to occur while we hold the mft record lock so
990 * return the locked ntfs inode.
991 */
992 *locked_ni = ni;
993 return TRUE;
994 }
995 ntfs_debug("Inode 0x%lx is not in icache.", mft_no);
996 /* The inode is not in icache. */
997 /* Write the record if it is not a mft record (type "FILE"). */
998 if (!ntfs_is_mft_record(m->magic)) {
999 ntfs_debug("Mft record 0x%lx is not a FILE record, write it.",
1000 mft_no);
1001 return TRUE;
1002 }
1003 /* Write the mft record if it is a base inode. */
1004 if (!m->base_mft_record) {
1005 ntfs_debug("Mft record 0x%lx is a base record, write it.",
1006 mft_no);
1007 return TRUE;
1008 }
1009 /*
1010 * This is an extent mft record. Check if the inode corresponding to
1011 * its base mft record is in icache and obtain a reference to it if it
1012 * is.
1013 */
1014 na.mft_no = MREF_LE(m->base_mft_record);
1015 ntfs_debug("Mft record 0x%lx is an extent record. Looking for base "
1016 "inode 0x%lx in icache.", mft_no, na.mft_no);
1017 vi = ilookup5(sb, na.mft_no, (test_t)ntfs_test_inode, &na);
1018 if (!vi) {
1019 /*
1020 * The base inode is not in icache, write this extent mft
1021 * record.
1022 */
1023 ntfs_debug("Base inode 0x%lx is not in icache, write the "
1024 "extent record.", na.mft_no);
1025 return TRUE;
1026 }
1027 ntfs_debug("Base inode 0x%lx is in icache.", na.mft_no);
1028 /*
1029 * The base inode is in icache. Check if it has the extent inode
1030 * corresponding to this extent mft record attached.
1031 */
1032 ni = NTFS_I(vi);
1033 down(&ni->extent_lock);
1034 if (ni->nr_extents <= 0) {
1035 /*
1036 * The base inode has no attached extent inodes, write this
1037 * extent mft record.
1038 */
1039 up(&ni->extent_lock);
1040 iput(vi);
1041 ntfs_debug("Base inode 0x%lx has no attached extent inodes, "
1042 "write the extent record.", na.mft_no);
1043 return TRUE;
1044 }
1045 /* Iterate over the attached extent inodes. */
1046 extent_nis = ni->ext.extent_ntfs_inos;
1047 for (eni = NULL, i = 0; i < ni->nr_extents; ++i) {
1048 if (mft_no == extent_nis[i]->mft_no) {
1049 /*
1050 * Found the extent inode corresponding to this extent
1051 * mft record.
1052 */
1053 eni = extent_nis[i];
1054 break;
1055 }
1056 }
1057 /*
1058 * If the extent inode was not attached to the base inode, write this
1059 * extent mft record.
1060 */
1061 if (!eni) {
1062 up(&ni->extent_lock);
1063 iput(vi);
1064 ntfs_debug("Extent inode 0x%lx is not attached to its base "
1065 "inode 0x%lx, write the extent record.",
1066 mft_no, na.mft_no);
1067 return TRUE;
1068 }
1069 ntfs_debug("Extent inode 0x%lx is attached to its base inode 0x%lx.",
1070 mft_no, na.mft_no);
1071 /* Take a reference to the extent ntfs inode. */
1072 atomic_inc(&eni->count);
1073 up(&ni->extent_lock);
1074 /*
1075 * Found the extent inode coresponding to this extent mft record.
1076 * Try to take the mft record lock.
1077 */
1078 if (unlikely(down_trylock(&eni->mrec_lock))) {
1079 atomic_dec(&eni->count);
1080 iput(vi);
1081 ntfs_debug("Extent mft record 0x%lx is already locked, do "
1082 "not write it.", mft_no);
1083 return FALSE;
1084 }
1085 ntfs_debug("Managed to lock extent mft record 0x%lx, write it.",
1086 mft_no);
1087 if (NInoTestClearDirty(eni))
1088 ntfs_debug("Extent inode 0x%lx is dirty, marking it clean.",
1089 mft_no);
1090 /*
1091 * The write has to occur while we hold the mft record lock so return
1092 * the locked extent ntfs inode.
1093 */
1094 *locked_ni = eni;
1095 return TRUE;
1096}
1097
/* Common suffix for error messages reporting unrecoverable metadata damage. */
static const char *es =
		" Leaving inconsistent metadata. Unmount and run chkdsk.";
1100
1101/**
1102 * ntfs_mft_bitmap_find_and_alloc_free_rec_nolock - see name
1103 * @vol: volume on which to search for a free mft record
1104 * @base_ni: open base inode if allocating an extent mft record or NULL
1105 *
1106 * Search for a free mft record in the mft bitmap attribute on the ntfs volume
1107 * @vol.
1108 *
1109 * If @base_ni is NULL start the search at the default allocator position.
1110 *
1111 * If @base_ni is not NULL start the search at the mft record after the base
1112 * mft record @base_ni.
1113 *
1114 * Return the free mft record on success and -errno on error. An error code of
1115 * -ENOSPC means that there are no free mft records in the currently
1116 * initialized mft bitmap.
1117 *
1118 * Locking: Caller must hold vol->mftbmp_lock for writing.
1119 */
1120static int ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(ntfs_volume *vol,
1121 ntfs_inode *base_ni)
1122{
1123 s64 pass_end, ll, data_pos, pass_start, ofs, bit;
1124 struct address_space *mftbmp_mapping;
1125 u8 *buf, *byte;
1126 struct page *page;
1127 unsigned int page_ofs, size;
1128 u8 pass, b;
1129
1130 ntfs_debug("Searching for free mft record in the currently "
1131 "initialized mft bitmap.");
1132 mftbmp_mapping = vol->mftbmp_ino->i_mapping;
1133 /*
1134 * Set the end of the pass making sure we do not overflow the mft
1135 * bitmap.
1136 */
1137 pass_end = NTFS_I(vol->mft_ino)->allocated_size >>
1138 vol->mft_record_size_bits;
1139 ll = NTFS_I(vol->mftbmp_ino)->initialized_size << 3;
1140 if (pass_end > ll)
1141 pass_end = ll;
1142 pass = 1;
1143 if (!base_ni)
1144 data_pos = vol->mft_data_pos;
1145 else
1146 data_pos = base_ni->mft_no + 1;
1147 if (data_pos < 24)
1148 data_pos = 24;
1149 if (data_pos >= pass_end) {
1150 data_pos = 24;
1151 pass = 2;
1152 /* This happens on a freshly formatted volume. */
1153 if (data_pos >= pass_end)
1154 return -ENOSPC;
1155 }
1156 pass_start = data_pos;
1157 ntfs_debug("Starting bitmap search: pass %u, pass_start 0x%llx, "
1158 "pass_end 0x%llx, data_pos 0x%llx.", pass,
1159 (long long)pass_start, (long long)pass_end,
1160 (long long)data_pos);
1161 /* Loop until a free mft record is found. */
1162 for (; pass <= 2;) {
1163 /* Cap size to pass_end. */
1164 ofs = data_pos >> 3;
1165 page_ofs = ofs & ~PAGE_CACHE_MASK;
1166 size = PAGE_CACHE_SIZE - page_ofs;
1167 ll = ((pass_end + 7) >> 3) - ofs;
1168 if (size > ll)
1169 size = ll;
1170 size <<= 3;
1171 /*
1172 * If we are still within the active pass, search the next page
1173 * for a zero bit.
1174 */
1175 if (size) {
1176 page = ntfs_map_page(mftbmp_mapping,
1177 ofs >> PAGE_CACHE_SHIFT);
1178 if (unlikely(IS_ERR(page))) {
1179 ntfs_error(vol->sb, "Failed to read mft "
1180 "bitmap, aborting.");
1181 return PTR_ERR(page);
1182 }
1183 buf = (u8*)page_address(page) + page_ofs;
1184 bit = data_pos & 7;
1185 data_pos &= ~7ull;
1186 ntfs_debug("Before inner for loop: size 0x%x, "
1187 "data_pos 0x%llx, bit 0x%llx", size,
1188 (long long)data_pos, (long long)bit);
1189 for (; bit < size && data_pos + bit < pass_end;
1190 bit &= ~7ull, bit += 8) {
1191 byte = buf + (bit >> 3);
1192 if (*byte == 0xff)
1193 continue;
1194 b = ffz((unsigned long)*byte);
1195 if (b < 8 && b >= (bit & 7)) {
1196 ll = data_pos + (bit & ~7ull) + b;
1197 if (unlikely(ll > (1ll << 32))) {
1198 ntfs_unmap_page(page);
1199 return -ENOSPC;
1200 }
1201 *byte |= 1 << b;
1202 flush_dcache_page(page);
1203 set_page_dirty(page);
1204 ntfs_unmap_page(page);
1205 ntfs_debug("Done. (Found and "
1206 "allocated mft record "
1207 "0x%llx.)",
1208 (long long)ll);
1209 return ll;
1210 }
1211 }
1212 ntfs_debug("After inner for loop: size 0x%x, "
1213 "data_pos 0x%llx, bit 0x%llx", size,
1214 (long long)data_pos, (long long)bit);
1215 data_pos += size;
1216 ntfs_unmap_page(page);
1217 /*
1218 * If the end of the pass has not been reached yet,
1219 * continue searching the mft bitmap for a zero bit.
1220 */
1221 if (data_pos < pass_end)
1222 continue;
1223 }
1224 /* Do the next pass. */
1225 if (++pass == 2) {
1226 /*
1227 * Starting the second pass, in which we scan the first
1228 * part of the zone which we omitted earlier.
1229 */
1230 pass_end = pass_start;
1231 data_pos = pass_start = 24;
1232 ntfs_debug("pass %i, pass_start 0x%llx, pass_end "
1233 "0x%llx.", pass, (long long)pass_start,
1234 (long long)pass_end);
1235 if (data_pos >= pass_end)
1236 break;
1237 }
1238 }
1239 /* No free mft records in currently initialized mft bitmap. */
1240 ntfs_debug("Done. (No free mft records left in currently initialized "
1241 "mft bitmap.)");
1242 return -ENOSPC;
1243}
1244
1245/**
1246 * ntfs_mft_bitmap_extend_allocation_nolock - extend mft bitmap by a cluster
1247 * @vol: volume on which to extend the mft bitmap attribute
1248 *
1249 * Extend the mft bitmap attribute on the ntfs volume @vol by one cluster.
1250 *
1251 * Note: Only changes allocated_size, i.e. does not touch initialized_size or
1252 * data_size.
1253 *
1254 * Return 0 on success and -errno on error.
1255 *
1256 * Locking: - Caller must hold vol->mftbmp_lock for writing.
1257 * - This function takes NTFS_I(vol->mftbmp_ino)->runlist.lock for
1258 * writing and releases it before returning.
1259 * - This function takes vol->lcnbmp_lock for writing and releases it
1260 * before returning.
1261 */
1262static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol)
1263{
1264 LCN lcn;
1265 s64 ll;
1266 struct page *page;
1267 ntfs_inode *mft_ni, *mftbmp_ni;
1268 runlist_element *rl, *rl2 = NULL;
1269 ntfs_attr_search_ctx *ctx = NULL;
1270 MFT_RECORD *mrec;
1271 ATTR_RECORD *a = NULL;
1272 int ret, mp_size;
1273 u32 old_alen = 0;
1274 u8 *b, tb;
1275 struct {
1276 u8 added_cluster:1;
1277 u8 added_run:1;
1278 u8 mp_rebuilt:1;
1279 } status = { 0, 0, 0 };
1280
1281 ntfs_debug("Extending mft bitmap allocation.");
1282 mft_ni = NTFS_I(vol->mft_ino);
1283 mftbmp_ni = NTFS_I(vol->mftbmp_ino);
1284 /*
1285 * Determine the last lcn of the mft bitmap. The allocated size of the
1286 * mft bitmap cannot be zero so we are ok to do this.
1287 * ntfs_find_vcn() returns the runlist locked on success.
1288 */
1289 rl = ntfs_find_vcn(mftbmp_ni, (mftbmp_ni->allocated_size - 1) >>
1290 vol->cluster_size_bits, TRUE);
1291 if (unlikely(IS_ERR(rl) || !rl->length || rl->lcn < 0)) {
1292 ntfs_error(vol->sb, "Failed to determine last allocated "
1293 "cluster of mft bitmap attribute.");
1294 if (!IS_ERR(rl)) {
1295 up_write(&mftbmp_ni->runlist.lock);
1296 ret = -EIO;
1297 } else
1298 ret = PTR_ERR(rl);
1299 return ret;
1300 }
1301 lcn = rl->lcn + rl->length;
1302 ntfs_debug("Last lcn of mft bitmap attribute is 0x%llx.",
1303 (long long)lcn);
1304 /*
1305 * Attempt to get the cluster following the last allocated cluster by
1306 * hand as it may be in the MFT zone so the allocator would not give it
1307 * to us.
1308 */
1309 ll = lcn >> 3;
1310 page = ntfs_map_page(vol->lcnbmp_ino->i_mapping,
1311 ll >> PAGE_CACHE_SHIFT);
1312 if (IS_ERR(page)) {
1313 up_write(&mftbmp_ni->runlist.lock);
1314 ntfs_error(vol->sb, "Failed to read from lcn bitmap.");
1315 return PTR_ERR(page);
1316 }
1317 b = (u8*)page_address(page) + (ll & ~PAGE_CACHE_MASK);
1318 tb = 1 << (lcn & 7ull);
1319 down_write(&vol->lcnbmp_lock);
1320 if (*b != 0xff && !(*b & tb)) {
1321 /* Next cluster is free, allocate it. */
1322 *b |= tb;
1323 flush_dcache_page(page);
1324 set_page_dirty(page);
1325 up_write(&vol->lcnbmp_lock);
1326 ntfs_unmap_page(page);
1327 /* Update the mft bitmap runlist. */
1328 rl->length++;
1329 rl[1].vcn++;
1330 status.added_cluster = 1;
1331 ntfs_debug("Appending one cluster to mft bitmap.");
1332 } else {
1333 up_write(&vol->lcnbmp_lock);
1334 ntfs_unmap_page(page);
1335 /* Allocate a cluster from the DATA_ZONE. */
1336 rl2 = ntfs_cluster_alloc(vol, rl[1].vcn, 1, lcn, DATA_ZONE);
1337 if (IS_ERR(rl2)) {
1338 up_write(&mftbmp_ni->runlist.lock);
1339 ntfs_error(vol->sb, "Failed to allocate a cluster for "
1340 "the mft bitmap.");
1341 return PTR_ERR(rl2);
1342 }
1343 rl = ntfs_runlists_merge(mftbmp_ni->runlist.rl, rl2);
1344 if (IS_ERR(rl)) {
1345 up_write(&mftbmp_ni->runlist.lock);
1346 ntfs_error(vol->sb, "Failed to merge runlists for mft "
1347 "bitmap.");
1348 if (ntfs_cluster_free_from_rl(vol, rl2)) {
1349 ntfs_error(vol->sb, "Failed to dealocate "
1350 "allocated cluster.%s", es);
1351 NVolSetErrors(vol);
1352 }
1353 ntfs_free(rl2);
1354 return PTR_ERR(rl);
1355 }
1356 mftbmp_ni->runlist.rl = rl;
1357 status.added_run = 1;
1358 ntfs_debug("Adding one run to mft bitmap.");
1359 /* Find the last run in the new runlist. */
1360 for (; rl[1].length; rl++)
1361 ;
1362 }
1363 /*
1364 * Update the attribute record as well. Note: @rl is the last
1365 * (non-terminator) runlist element of mft bitmap.
1366 */
1367 mrec = map_mft_record(mft_ni);
1368 if (IS_ERR(mrec)) {
1369 ntfs_error(vol->sb, "Failed to map mft record.");
1370 ret = PTR_ERR(mrec);
1371 goto undo_alloc;
1372 }
1373 ctx = ntfs_attr_get_search_ctx(mft_ni, mrec);
1374 if (unlikely(!ctx)) {
1375 ntfs_error(vol->sb, "Failed to get search context.");
1376 ret = -ENOMEM;
1377 goto undo_alloc;
1378 }
1379 ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
1380 mftbmp_ni->name_len, CASE_SENSITIVE, rl[1].vcn, NULL,
1381 0, ctx);
1382 if (unlikely(ret)) {
1383 ntfs_error(vol->sb, "Failed to find last attribute extent of "
1384 "mft bitmap attribute.");
1385 if (ret == -ENOENT)
1386 ret = -EIO;
1387 goto undo_alloc;
1388 }
1389 a = ctx->attr;
1390 ll = sle64_to_cpu(a->data.non_resident.lowest_vcn);
1391 /* Search back for the previous last allocated cluster of mft bitmap. */
1392 for (rl2 = rl; rl2 > mftbmp_ni->runlist.rl; rl2--) {
1393 if (ll >= rl2->vcn)
1394 break;
1395 }
1396 BUG_ON(ll < rl2->vcn);
1397 BUG_ON(ll >= rl2->vcn + rl2->length);
1398 /* Get the size for the new mapping pairs array for this extent. */
1399 mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll);
1400 if (unlikely(mp_size <= 0)) {
1401 ntfs_error(vol->sb, "Get size for mapping pairs failed for "
1402 "mft bitmap attribute extent.");
1403 ret = mp_size;
1404 if (!ret)
1405 ret = -EIO;
1406 goto undo_alloc;
1407 }
1408 /* Expand the attribute record if necessary. */
1409 old_alen = le32_to_cpu(a->length);
1410 ret = ntfs_attr_record_resize(ctx->mrec, a, mp_size +
1411 le16_to_cpu(a->data.non_resident.mapping_pairs_offset));
1412 if (unlikely(ret)) {
1413 if (ret != -ENOSPC) {
1414 ntfs_error(vol->sb, "Failed to resize attribute "
1415 "record for mft bitmap attribute.");
1416 goto undo_alloc;
1417 }
1418 // TODO: Deal with this by moving this extent to a new mft
1419 // record or by starting a new extent in a new mft record or by
1420 // moving other attributes out of this mft record.
1421 ntfs_error(vol->sb, "Not enough space in this mft record to "
1422 "accomodate extended mft bitmap attribute "
1423 "extent. Cannot handle this yet.");
1424 ret = -EOPNOTSUPP;
1425 goto undo_alloc;
1426 }
1427 status.mp_rebuilt = 1;
1428 /* Generate the mapping pairs array directly into the attr record. */
1429 ret = ntfs_mapping_pairs_build(vol, (u8*)a +
1430 le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
1431 mp_size, rl2, ll, NULL);
1432 if (unlikely(ret)) {
1433 ntfs_error(vol->sb, "Failed to build mapping pairs array for "
1434 "mft bitmap attribute.");
1435 goto undo_alloc;
1436 }
1437 /* Update the highest_vcn. */
1438 a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 1);
1439 /*
1440 * We now have extended the mft bitmap allocated_size by one cluster.
1441 * Reflect this in the ntfs_inode structure and the attribute record.
1442 */
1443 if (a->data.non_resident.lowest_vcn) {
1444 /*
1445 * We are not in the first attribute extent, switch to it, but
1446 * first ensure the changes will make it to disk later.
1447 */
1448 flush_dcache_mft_record_page(ctx->ntfs_ino);
1449 mark_mft_record_dirty(ctx->ntfs_ino);
1450 ntfs_attr_reinit_search_ctx(ctx);
1451 ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
1452 mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL,
1453 0, ctx);
1454 if (unlikely(ret)) {
1455 ntfs_error(vol->sb, "Failed to find first attribute "
1456 "extent of mft bitmap attribute.");
1457 goto restore_undo_alloc;
1458 }
1459 a = ctx->attr;
1460 }
1461 mftbmp_ni->allocated_size += vol->cluster_size;
1462 a->data.non_resident.allocated_size =
1463 cpu_to_sle64(mftbmp_ni->allocated_size);
1464 /* Ensure the changes make it to disk. */
1465 flush_dcache_mft_record_page(ctx->ntfs_ino);
1466 mark_mft_record_dirty(ctx->ntfs_ino);
1467 ntfs_attr_put_search_ctx(ctx);
1468 unmap_mft_record(mft_ni);
1469 up_write(&mftbmp_ni->runlist.lock);
1470 ntfs_debug("Done.");
1471 return 0;
1472restore_undo_alloc:
1473 ntfs_attr_reinit_search_ctx(ctx);
1474 if (ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
1475 mftbmp_ni->name_len, CASE_SENSITIVE, rl[1].vcn, NULL,
1476 0, ctx)) {
1477 ntfs_error(vol->sb, "Failed to find last attribute extent of "
1478 "mft bitmap attribute.%s", es);
1479 mftbmp_ni->allocated_size += vol->cluster_size;
1480 ntfs_attr_put_search_ctx(ctx);
1481 unmap_mft_record(mft_ni);
1482 up_write(&mftbmp_ni->runlist.lock);
1483 /*
1484 * The only thing that is now wrong is ->allocated_size of the
1485 * base attribute extent which chkdsk should be able to fix.
1486 */
1487 NVolSetErrors(vol);
1488 return ret;
1489 }
1490 a = ctx->attr;
1491 a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 2);
1492undo_alloc:
1493 if (status.added_cluster) {
1494 /* Truncate the last run in the runlist by one cluster. */
1495 rl->length--;
1496 rl[1].vcn--;
1497 } else if (status.added_run) {
1498 lcn = rl->lcn;
1499 /* Remove the last run from the runlist. */
1500 rl->lcn = rl[1].lcn;
1501 rl->length = 0;
1502 }
1503 /* Deallocate the cluster. */
1504 down_write(&vol->lcnbmp_lock);
1505 if (ntfs_bitmap_clear_bit(vol->lcnbmp_ino, lcn)) {
1506 ntfs_error(vol->sb, "Failed to free allocated cluster.%s", es);
1507 NVolSetErrors(vol);
1508 }
1509 up_write(&vol->lcnbmp_lock);
1510 if (status.mp_rebuilt) {
1511 if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
1512 a->data.non_resident.mapping_pairs_offset),
1513 old_alen - le16_to_cpu(
1514 a->data.non_resident.mapping_pairs_offset),
1515 rl2, ll, NULL)) {
1516 ntfs_error(vol->sb, "Failed to restore mapping pairs "
1517 "array.%s", es);
1518 NVolSetErrors(vol);
1519 }
1520 if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) {
1521 ntfs_error(vol->sb, "Failed to restore attribute "
1522 "record.%s", es);
1523 NVolSetErrors(vol);
1524 }
1525 flush_dcache_mft_record_page(ctx->ntfs_ino);
1526 mark_mft_record_dirty(ctx->ntfs_ino);
1527 }
1528 if (ctx)
1529 ntfs_attr_put_search_ctx(ctx);
1530 if (!IS_ERR(mrec))
1531 unmap_mft_record(mft_ni);
1532 up_write(&mftbmp_ni->runlist.lock);
1533 return ret;
1534}
1535
1536/**
1537 * ntfs_mft_bitmap_extend_initialized_nolock - extend mftbmp initialized data
1538 * @vol: volume on which to extend the mft bitmap attribute
1539 *
1540 * Extend the initialized portion of the mft bitmap attribute on the ntfs
1541 * volume @vol by 8 bytes.
1542 *
1543 * Note: Only changes initialized_size and data_size, i.e. requires that
1544 * allocated_size is big enough to fit the new initialized_size.
1545 *
1546 * Return 0 on success and -errno on error.
1547 *
1548 * Locking: Caller must hold vol->mftbmp_lock for writing.
1549 */
1550static int ntfs_mft_bitmap_extend_initialized_nolock(ntfs_volume *vol)
1551{
1552 s64 old_data_size, old_initialized_size;
1553 struct inode *mftbmp_vi;
1554 ntfs_inode *mft_ni, *mftbmp_ni;
1555 ntfs_attr_search_ctx *ctx;
1556 MFT_RECORD *mrec;
1557 ATTR_RECORD *a;
1558 int ret;
1559
1560 ntfs_debug("Extending mft bitmap initiailized (and data) size.");
1561 mft_ni = NTFS_I(vol->mft_ino);
1562 mftbmp_vi = vol->mftbmp_ino;
1563 mftbmp_ni = NTFS_I(mftbmp_vi);
1564 /* Get the attribute record. */
1565 mrec = map_mft_record(mft_ni);
1566 if (IS_ERR(mrec)) {
1567 ntfs_error(vol->sb, "Failed to map mft record.");
1568 return PTR_ERR(mrec);
1569 }
1570 ctx = ntfs_attr_get_search_ctx(mft_ni, mrec);
1571 if (unlikely(!ctx)) {
1572 ntfs_error(vol->sb, "Failed to get search context.");
1573 ret = -ENOMEM;
1574 goto unm_err_out;
1575 }
1576 ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
1577 mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL, 0, ctx);
1578 if (unlikely(ret)) {
1579 ntfs_error(vol->sb, "Failed to find first attribute extent of "
1580 "mft bitmap attribute.");
1581 if (ret == -ENOENT)
1582 ret = -EIO;
1583 goto put_err_out;
1584 }
1585 a = ctx->attr;
1586 old_data_size = mftbmp_vi->i_size;
1587 old_initialized_size = mftbmp_ni->initialized_size;
1588 /*
1589 * We can simply update the initialized_size before filling the space
1590 * with zeroes because the caller is holding the mft bitmap lock for
1591 * writing which ensures that no one else is trying to access the data.
1592 */
1593 mftbmp_ni->initialized_size += 8;
1594 a->data.non_resident.initialized_size =
1595 cpu_to_sle64(mftbmp_ni->initialized_size);
1596 if (mftbmp_ni->initialized_size > mftbmp_vi->i_size) {
1597 mftbmp_vi->i_size = mftbmp_ni->initialized_size;
1598 a->data.non_resident.data_size =
1599 cpu_to_sle64(mftbmp_vi->i_size);
1600 }
1601 /* Ensure the changes make it to disk. */
1602 flush_dcache_mft_record_page(ctx->ntfs_ino);
1603 mark_mft_record_dirty(ctx->ntfs_ino);
1604 ntfs_attr_put_search_ctx(ctx);
1605 unmap_mft_record(mft_ni);
1606 /* Initialize the mft bitmap attribute value with zeroes. */
1607 ret = ntfs_attr_set(mftbmp_ni, old_initialized_size, 8, 0);
1608 if (likely(!ret)) {
1609 ntfs_debug("Done. (Wrote eight initialized bytes to mft "
1610 "bitmap.");
1611 return 0;
1612 }
1613 ntfs_error(vol->sb, "Failed to write to mft bitmap.");
1614 /* Try to recover from the error. */
1615 mrec = map_mft_record(mft_ni);
1616 if (IS_ERR(mrec)) {
1617 ntfs_error(vol->sb, "Failed to map mft record.%s", es);
1618 NVolSetErrors(vol);
1619 return ret;
1620 }
1621 ctx = ntfs_attr_get_search_ctx(mft_ni, mrec);
1622 if (unlikely(!ctx)) {
1623 ntfs_error(vol->sb, "Failed to get search context.%s", es);
1624 NVolSetErrors(vol);
1625 goto unm_err_out;
1626 }
1627 if (ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
1628 mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL, 0, ctx)) {
1629 ntfs_error(vol->sb, "Failed to find first attribute extent of "
1630 "mft bitmap attribute.%s", es);
1631 NVolSetErrors(vol);
1632put_err_out:
1633 ntfs_attr_put_search_ctx(ctx);
1634unm_err_out:
1635 unmap_mft_record(mft_ni);
1636 goto err_out;
1637 }
1638 a = ctx->attr;
1639 mftbmp_ni->initialized_size = old_initialized_size;
1640 a->data.non_resident.initialized_size =
1641 cpu_to_sle64(old_initialized_size);
1642 if (mftbmp_vi->i_size != old_data_size) {
1643 mftbmp_vi->i_size = old_data_size;
1644 a->data.non_resident.data_size = cpu_to_sle64(old_data_size);
1645 }
1646 flush_dcache_mft_record_page(ctx->ntfs_ino);
1647 mark_mft_record_dirty(ctx->ntfs_ino);
1648 ntfs_attr_put_search_ctx(ctx);
1649 unmap_mft_record(mft_ni);
1650 ntfs_debug("Restored status of mftbmp: allocated_size 0x%llx, "
1651 "data_size 0x%llx, initialized_size 0x%llx.",
1652 (long long)mftbmp_ni->allocated_size,
1653 (long long)mftbmp_vi->i_size,
1654 (long long)mftbmp_ni->initialized_size);
1655err_out:
1656 return ret;
1657}
1658
1659/**
1660 * ntfs_mft_data_extend_allocation_nolock - extend mft data attribute
1661 * @vol: volume on which to extend the mft data attribute
1662 *
1663 * Extend the mft data attribute on the ntfs volume @vol by 16 mft records
1664 * worth of clusters or if not enough space for this by one mft record worth
1665 * of clusters.
1666 *
1667 * Note: Only changes allocated_size, i.e. does not touch initialized_size or
1668 * data_size.
1669 *
1670 * Return 0 on success and -errno on error.
1671 *
1672 * Locking: - Caller must hold vol->mftbmp_lock for writing.
1673 * - This function takes NTFS_I(vol->mft_ino)->runlist.lock for
1674 * writing and releases it before returning.
1675 * - This function calls functions which take vol->lcnbmp_lock for
1676 * writing and release it before returning.
1677 */
1678static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol)
1679{
1680 LCN lcn;
1681 VCN old_last_vcn;
1682 s64 min_nr, nr, ll = 0;
1683 ntfs_inode *mft_ni;
1684 runlist_element *rl, *rl2;
1685 ntfs_attr_search_ctx *ctx = NULL;
1686 MFT_RECORD *mrec;
1687 ATTR_RECORD *a = NULL;
1688 int ret, mp_size;
1689 u32 old_alen = 0;
1690 BOOL mp_rebuilt = FALSE;
1691
1692 ntfs_debug("Extending mft data allocation.");
1693 mft_ni = NTFS_I(vol->mft_ino);
1694 /*
1695 * Determine the preferred allocation location, i.e. the last lcn of
1696 * the mft data attribute. The allocated size of the mft data
1697 * attribute cannot be zero so we are ok to do this.
1698 * ntfs_find_vcn() returns the runlist locked on success.
1699 */
1700 rl = ntfs_find_vcn(mft_ni, (mft_ni->allocated_size - 1) >>
1701 vol->cluster_size_bits, TRUE);
1702 if (unlikely(IS_ERR(rl) || !rl->length || rl->lcn < 0)) {
1703 ntfs_error(vol->sb, "Failed to determine last allocated "
1704 "cluster of mft data attribute.");
1705 if (!IS_ERR(rl)) {
1706 up_write(&mft_ni->runlist.lock);
1707 ret = -EIO;
1708 } else
1709 ret = PTR_ERR(rl);
1710 return ret;
1711 }
1712 lcn = rl->lcn + rl->length;
1713 ntfs_debug("Last lcn of mft data attribute is 0x%llx.",
1714 (long long)lcn);
1715 /* Minimum allocation is one mft record worth of clusters. */
1716 min_nr = vol->mft_record_size >> vol->cluster_size_bits;
1717 if (!min_nr)
1718 min_nr = 1;
1719 /* Want to allocate 16 mft records worth of clusters. */
1720 nr = vol->mft_record_size << 4 >> vol->cluster_size_bits;
1721 if (!nr)
1722 nr = min_nr;
1723 /* Ensure we do not go above 2^32-1 mft records. */
1724 if (unlikely((mft_ni->allocated_size +
1725 (nr << vol->cluster_size_bits)) >>
1726 vol->mft_record_size_bits >= (1ll << 32))) {
1727 nr = min_nr;
1728 if (unlikely((mft_ni->allocated_size +
1729 (nr << vol->cluster_size_bits)) >>
1730 vol->mft_record_size_bits >= (1ll << 32))) {
1731 ntfs_warning(vol->sb, "Cannot allocate mft record "
1732 "because the maximum number of inodes "
1733 "(2^32) has already been reached.");
1734 up_write(&mft_ni->runlist.lock);
1735 return -ENOSPC;
1736 }
1737 }
1738 ntfs_debug("Trying mft data allocation with %s cluster count %lli.",
1739 nr > min_nr ? "default" : "minimal", (long long)nr);
1740 old_last_vcn = rl[1].vcn;
1741 do {
1742 rl2 = ntfs_cluster_alloc(vol, old_last_vcn, nr, lcn, MFT_ZONE);
1743 if (likely(!IS_ERR(rl2)))
1744 break;
1745 if (PTR_ERR(rl2) != -ENOSPC || nr == min_nr) {
1746 ntfs_error(vol->sb, "Failed to allocate the minimal "
1747 "number of clusters (%lli) for the "
1748 "mft data attribute.", (long long)nr);
1749 up_write(&mft_ni->runlist.lock);
1750 return PTR_ERR(rl2);
1751 }
1752 /*
1753 * There is not enough space to do the allocation, but there
1754 * might be enough space to do a minimal allocation so try that
1755 * before failing.
1756 */
1757 nr = min_nr;
1758 ntfs_debug("Retrying mft data allocation with minimal cluster "
1759 "count %lli.", (long long)nr);
1760 } while (1);
1761 rl = ntfs_runlists_merge(mft_ni->runlist.rl, rl2);
1762 if (IS_ERR(rl)) {
1763 up_write(&mft_ni->runlist.lock);
1764 ntfs_error(vol->sb, "Failed to merge runlists for mft data "
1765 "attribute.");
1766 if (ntfs_cluster_free_from_rl(vol, rl2)) {
1767 ntfs_error(vol->sb, "Failed to dealocate clusters "
1768 "from the mft data attribute.%s", es);
1769 NVolSetErrors(vol);
1770 }
1771 ntfs_free(rl2);
1772 return PTR_ERR(rl);
1773 }
1774 mft_ni->runlist.rl = rl;
1775 ntfs_debug("Allocated %lli clusters.", nr);
1776 /* Find the last run in the new runlist. */
1777 for (; rl[1].length; rl++)
1778 ;
1779 /* Update the attribute record as well. */
1780 mrec = map_mft_record(mft_ni);
1781 if (IS_ERR(mrec)) {
1782 ntfs_error(vol->sb, "Failed to map mft record.");
1783 ret = PTR_ERR(mrec);
1784 goto undo_alloc;
1785 }
1786 ctx = ntfs_attr_get_search_ctx(mft_ni, mrec);
1787 if (unlikely(!ctx)) {
1788 ntfs_error(vol->sb, "Failed to get search context.");
1789 ret = -ENOMEM;
1790 goto undo_alloc;
1791 }
1792 ret = ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len,
1793 CASE_SENSITIVE, rl[1].vcn, NULL, 0, ctx);
1794 if (unlikely(ret)) {
1795 ntfs_error(vol->sb, "Failed to find last attribute extent of "
1796 "mft data attribute.");
1797 if (ret == -ENOENT)
1798 ret = -EIO;
1799 goto undo_alloc;
1800 }
1801 a = ctx->attr;
1802 ll = sle64_to_cpu(a->data.non_resident.lowest_vcn);
1803 /* Search back for the runlist element containing the lowest vcn of this attribute extent. */
1804 for (rl2 = rl; rl2 > mft_ni->runlist.rl; rl2--) {
1805 if (ll >= rl2->vcn)
1806 break;
1807 }
1808 BUG_ON(ll < rl2->vcn);
1809 BUG_ON(ll >= rl2->vcn + rl2->length);
1810 /* Get the size for the new mapping pairs array for this extent. */
1811 mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll);
1812 if (unlikely(mp_size <= 0)) {
1813 ntfs_error(vol->sb, "Get size for mapping pairs failed for "
1814 "mft data attribute extent.");
1815 ret = mp_size;
1816 if (!ret)
1817 ret = -EIO;
1818 goto undo_alloc;
1819 }
1820 /* Expand the attribute record if necessary. */
1821 old_alen = le32_to_cpu(a->length);
1822 ret = ntfs_attr_record_resize(ctx->mrec, a, mp_size +
1823 le16_to_cpu(a->data.non_resident.mapping_pairs_offset));
1824 if (unlikely(ret)) {
1825 if (ret != -ENOSPC) {
1826 ntfs_error(vol->sb, "Failed to resize attribute "
1827 "record for mft data attribute.");
1828 goto undo_alloc;
1829 }
1830 // TODO: Deal with this by moving this extent to a new mft
1831 // record or by starting a new extent in a new mft record or by
1832 // moving other attributes out of this mft record.
1833 // Note: Use the special reserved mft records and ensure that
1834 // this extent is not required to find the mft record in
1835 // question.
1836 ntfs_error(vol->sb, "Not enough space in this mft record to "
1837 "accomodate extended mft data attribute "
1838 "extent. Cannot handle this yet.");
1839 ret = -EOPNOTSUPP;
1840 goto undo_alloc;
1841 }
1842 mp_rebuilt = TRUE;
1843 /* Generate the mapping pairs array directly into the attr record. */
1844 ret = ntfs_mapping_pairs_build(vol, (u8*)a +
1845 le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
1846 mp_size, rl2, ll, NULL);
1847 if (unlikely(ret)) {
1848 ntfs_error(vol->sb, "Failed to build mapping pairs array of "
1849 "mft data attribute.");
1850 goto undo_alloc;
1851 }
1852 /* Update the highest_vcn. */
1853 a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 1);
1854 /*
1855 * We now have extended the mft data allocated_size by nr clusters.
1856 * Reflect this in the ntfs_inode structure and the attribute record.
1857 * @rl is the last (non-terminator) runlist element of mft data
1858 * attribute.
1859 */
1860 if (a->data.non_resident.lowest_vcn) {
1861 /*
1862 * We are not in the first attribute extent, switch to it, but
1863 * first ensure the changes will make it to disk later.
1864 */
1865 flush_dcache_mft_record_page(ctx->ntfs_ino);
1866 mark_mft_record_dirty(ctx->ntfs_ino);
1867 ntfs_attr_reinit_search_ctx(ctx);
1868 ret = ntfs_attr_lookup(mft_ni->type, mft_ni->name,
1869 mft_ni->name_len, CASE_SENSITIVE, 0, NULL, 0,
1870 ctx);
1871 if (unlikely(ret)) {
1872 ntfs_error(vol->sb, "Failed to find first attribute "
1873 "extent of mft data attribute.");
1874 goto restore_undo_alloc;
1875 }
1876 a = ctx->attr;
1877 }
1878 mft_ni->allocated_size += nr << vol->cluster_size_bits;
1879 a->data.non_resident.allocated_size =
1880 cpu_to_sle64(mft_ni->allocated_size);
1881 /* Ensure the changes make it to disk. */
1882 flush_dcache_mft_record_page(ctx->ntfs_ino);
1883 mark_mft_record_dirty(ctx->ntfs_ino);
1884 ntfs_attr_put_search_ctx(ctx);
1885 unmap_mft_record(mft_ni);
1886 up_write(&mft_ni->runlist.lock);
1887 ntfs_debug("Done.");
1888 return 0;
1889restore_undo_alloc:
1890 ntfs_attr_reinit_search_ctx(ctx);
1891 if (ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len,
1892 CASE_SENSITIVE, rl[1].vcn, NULL, 0, ctx)) {
1893 ntfs_error(vol->sb, "Failed to find last attribute extent of "
1894 "mft data attribute.%s", es);
1895 mft_ni->allocated_size += nr << vol->cluster_size_bits;
1896 ntfs_attr_put_search_ctx(ctx);
1897 unmap_mft_record(mft_ni);
1898 up_write(&mft_ni->runlist.lock);
1899 /*
1900 * The only thing that is now wrong is ->allocated_size of the
1901 * base attribute extent which chkdsk should be able to fix.
1902 */
1903 NVolSetErrors(vol);
1904 return ret;
1905 }
1906 a = ctx->attr;
1907 a->data.non_resident.highest_vcn = cpu_to_sle64(old_last_vcn - 1);
1908undo_alloc:
1909 if (ntfs_cluster_free(vol->mft_ino, old_last_vcn, -1) < 0) {
1910 ntfs_error(vol->sb, "Failed to free clusters from mft data "
1911 "attribute.%s", es);
1912 NVolSetErrors(vol);
1913 }
1914 if (ntfs_rl_truncate_nolock(vol, &mft_ni->runlist, old_last_vcn)) {
1915 ntfs_error(vol->sb, "Failed to truncate mft data attribute "
1916 "runlist.%s", es);
1917 NVolSetErrors(vol);
1918 }
1919 if (mp_rebuilt) {
1920 if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
1921 a->data.non_resident.mapping_pairs_offset),
1922 old_alen - le16_to_cpu(
1923 a->data.non_resident.mapping_pairs_offset),
1924 rl2, ll, NULL)) {
1925 ntfs_error(vol->sb, "Failed to restore mapping pairs "
1926 "array.%s", es);
1927 NVolSetErrors(vol);
1928 }
1929 if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) {
1930 ntfs_error(vol->sb, "Failed to restore attribute "
1931 "record.%s", es);
1932 NVolSetErrors(vol);
1933 }
1934 flush_dcache_mft_record_page(ctx->ntfs_ino);
1935 mark_mft_record_dirty(ctx->ntfs_ino);
1936 }
1937 if (ctx)
1938 ntfs_attr_put_search_ctx(ctx);
1939 if (!IS_ERR(mrec))
1940 unmap_mft_record(mft_ni);
1941 up_write(&mft_ni->runlist.lock);
1942 return ret;
1943}
1944
1945/**
1946 * ntfs_mft_record_layout - layout an mft record into a memory buffer
1947 * @vol: volume to which the mft record will belong
1948 * @mft_no: mft reference specifying the mft record number
1949 * @m: destination buffer of size >= @vol->mft_record_size bytes
1950 *
1951 * Layout an empty, unused mft record with the mft record number @mft_no into
1952 * the buffer @m. The volume @vol is needed because the mft record structure
1953 * was modified in NTFS 3.1 so we need to know which volume version this mft
1954 * record will be used on.
1955 *
1956 * Return 0 on success and -errno on error.
1957 */
1958static int ntfs_mft_record_layout(const ntfs_volume *vol, const s64 mft_no,
1959 MFT_RECORD *m)
1960{
1961 ATTR_RECORD *a;
1962
1963 ntfs_debug("Entering for mft record 0x%llx.", (long long)mft_no);
1964 if (mft_no >= (1ll << 32)) {
1965 ntfs_error(vol->sb, "Mft record number 0x%llx exceeds "
1966 "maximum of 2^32.", (long long)mft_no);
1967 return -ERANGE;
1968 }
1969 /* Start by clearing the whole mft record to gives us a clean slate. */
1970 memset(m, 0, vol->mft_record_size);
1971 /* Aligned to 2-byte boundary. */
1972 if (vol->major_ver < 3 || (vol->major_ver == 3 && !vol->minor_ver))
1973 m->usa_ofs = cpu_to_le16((sizeof(MFT_RECORD_OLD) + 1) & ~1);
1974 else {
1975 m->usa_ofs = cpu_to_le16((sizeof(MFT_RECORD) + 1) & ~1);
1976 /*
1977 * Set the NTFS 3.1+ specific fields while we know that the
1978 * volume version is 3.1+.
1979 */
1980 m->reserved = 0;
1981 m->mft_record_number = cpu_to_le32((u32)mft_no);
1982 }
1983 m->magic = magic_FILE;
1984 if (vol->mft_record_size >= NTFS_BLOCK_SIZE)
1985 m->usa_count = cpu_to_le16(vol->mft_record_size /
1986 NTFS_BLOCK_SIZE + 1);
1987 else {
1988 m->usa_count = cpu_to_le16(1);
1989 ntfs_warning(vol->sb, "Sector size is bigger than mft record "
1990 "size. Setting usa_count to 1. If chkdsk "
1991 "reports this as corruption, please email "
1992 "linux-ntfs-dev@lists.sourceforge.net stating "
1993 "that you saw this message and that the "
1994 "modified file system created was corrupt. "
1995 "Thank you.");
1996 }
1997 /* Set the update sequence number to 1. */
1998 *(le16*)((u8*)m + le16_to_cpu(m->usa_ofs)) = cpu_to_le16(1);
1999 m->lsn = 0;
2000 m->sequence_number = cpu_to_le16(1);
2001 m->link_count = 0;
2002 /*
2003 * Place the attributes straight after the update sequence array,
2004 * aligned to 8-byte boundary.
2005 */
2006 m->attrs_offset = cpu_to_le16((le16_to_cpu(m->usa_ofs) +
2007 (le16_to_cpu(m->usa_count) << 1) + 7) & ~7);
2008 m->flags = 0;
2009 /*
2010 * Using attrs_offset plus eight bytes (for the termination attribute).
2011 * attrs_offset is already aligned to 8-byte boundary, so no need to
2012 * align again.
2013 */
2014 m->bytes_in_use = cpu_to_le32(le16_to_cpu(m->attrs_offset) + 8);
2015 m->bytes_allocated = cpu_to_le32(vol->mft_record_size);
2016 m->base_mft_record = 0;
2017 m->next_attr_instance = 0;
2018 /* Add the termination attribute. */
2019 a = (ATTR_RECORD*)((u8*)m + le16_to_cpu(m->attrs_offset));
2020 a->type = AT_END;
2021 a->length = 0;
2022 ntfs_debug("Done.");
2023 return 0;
2024}
2025
2026/**
2027 * ntfs_mft_record_format - format an mft record on an ntfs volume
2028 * @vol: volume on which to format the mft record
2029 * @mft_no: mft record number to format
2030 *
2031 * Format the mft record @mft_no in $MFT/$DATA, i.e. lay out an empty, unused
2032 * mft record into the appropriate place of the mft data attribute. This is
2033 * used when extending the mft data attribute.
2034 *
2035 * Return 0 on success and -errno on error.
2036 */
2037static int ntfs_mft_record_format(const ntfs_volume *vol, const s64 mft_no)
2038{
2039 struct inode *mft_vi = vol->mft_ino;
2040 struct page *page;
2041 MFT_RECORD *m;
2042 pgoff_t index, end_index;
2043 unsigned int ofs;
2044 int err;
2045
2046 ntfs_debug("Entering for mft record 0x%llx.", (long long)mft_no);
2047 /*
2048 * The index into the page cache and the offset within the page cache
2049 * page of the wanted mft record.
2050 */
2051 index = mft_no << vol->mft_record_size_bits >> PAGE_CACHE_SHIFT;
2052 ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK;
2053 /* The maximum valid index into the page cache for $MFT's data. */
2054 end_index = mft_vi->i_size >> PAGE_CACHE_SHIFT;
2055 if (unlikely(index >= end_index)) {
2056 if (unlikely(index > end_index || ofs + vol->mft_record_size >=
2057 (mft_vi->i_size & ~PAGE_CACHE_MASK))) {
2058 ntfs_error(vol->sb, "Tried to format non-existing mft "
2059 "record 0x%llx.", (long long)mft_no);
2060 return -ENOENT;
2061 }
2062 }
2063 /* Read, map, and pin the page containing the mft record. */
2064 page = ntfs_map_page(mft_vi->i_mapping, index);
2065 if (unlikely(IS_ERR(page))) {
2066 ntfs_error(vol->sb, "Failed to map page containing mft record "
2067 "to format 0x%llx.", (long long)mft_no);
2068 return PTR_ERR(page);
2069 }
2070 lock_page(page);
2071 BUG_ON(!PageUptodate(page));
2072 ClearPageUptodate(page);
2073 m = (MFT_RECORD*)((u8*)page_address(page) + ofs);
2074 err = ntfs_mft_record_layout(vol, mft_no, m);
2075 if (unlikely(err)) {
2076 ntfs_error(vol->sb, "Failed to layout mft record 0x%llx.",
2077 (long long)mft_no);
2078 SetPageUptodate(page);
2079 unlock_page(page);
2080 ntfs_unmap_page(page);
2081 return err;
2082 }
2083 flush_dcache_page(page);
2084 SetPageUptodate(page);
2085 unlock_page(page);
2086 /*
2087 * Make sure the mft record is written out to disk. We could use
2088 * ilookup5() to check if an inode is in icache and so on but this is
2089 * unnecessary as ntfs_writepage() will write the dirty record anyway.
2090 */
2091 mark_ntfs_record_dirty(page, ofs);
2092 ntfs_unmap_page(page);
2093 ntfs_debug("Done.");
2094 return 0;
2095}
2096
2097/**
2098 * ntfs_mft_record_alloc - allocate an mft record on an ntfs volume
2099 * @vol: [IN] volume on which to allocate the mft record
2100 * @mode: [IN] mode if want a file or directory, i.e. base inode or 0
2101 * @base_ni: [IN] open base inode if allocating an extent mft record or NULL
2102 * @mrec: [OUT] on successful return this is the mapped mft record
2103 *
2104 * Allocate an mft record in $MFT/$DATA of an open ntfs volume @vol.
2105 *
2106 * If @base_ni is NULL make the mft record a base mft record, i.e. a file or
2107 * directory inode, and allocate it at the default allocator position. In
2108 * this case @mode is the file mode as given to us by the caller. We in
2109 * particular use @mode to distinguish whether a file or a directory is being
2110 * created (S_ISREG(mode) and S_ISDIR(mode), respectively).
2111 *
2112 * If @base_ni is not NULL make the allocated mft record an extent record,
2113 * allocate it starting at the mft record after the base mft record and attach
2114 * the allocated and opened ntfs inode to the base inode @base_ni. In this
2115 * case @mode must be 0 as it is meaningless for extent inodes.
2116 *
2117 * You need to check the return value with IS_ERR(). If false, the function
2118 * was successful and the return value is the now opened ntfs inode of the
2119 * allocated mft record. *@mrec is then set to the allocated, mapped, pinned,
2120 * and locked mft record. If IS_ERR() is true, the function failed and the
2121 * error code is obtained from PTR_ERR(return value). *@mrec is undefined in
2122 * this case.
2123 *
2124 * Allocation strategy:
2125 *
2126 * To find a free mft record, we scan the mft bitmap for a zero bit. To
2127 * optimize this we start scanning at the place specified by @base_ni or if
2128 * @base_ni is NULL we start where we last stopped and we perform wrap around
2129 * when we reach the end. Note, we do not try to allocate mft records below
2130 * number 24 because numbers 0 to 15 are the defined system files anyway and 16
2131 * to 23 are special in that they are used for storing extension mft records
2132 * for the $DATA attribute of $MFT. This is required to avoid the possibility
2133 * of creating a runlist with a circular dependency which once written to disk
2134 * can never be read in again. Windows will only use records 16 to 23 for
2135 * normal files if the volume is completely out of space. We never use them
2136 * which means that when the volume is really out of space we cannot create any
2137 * more files while Windows can still create up to 8 small files. We can start
2138 * doing this at some later time, it does not matter much for now.
2139 *
2140 * When scanning the mft bitmap, we only search up to the last allocated mft
2141 * record. If there are no free records left in the range 24 to number of
2142 * allocated mft records, then we extend the $MFT/$DATA attribute in order to
2143 * create free mft records. We extend the allocated size of $MFT/$DATA by 16
2144 * records at a time or one cluster, if cluster size is above 16kiB. If there
2145 * is not sufficient space to do this, we try to extend by a single mft record
2146 * or one cluster, if cluster size is above the mft record size.
2147 *
2148 * No matter how many mft records we allocate, we initialize only the first
2149 * allocated mft record, incrementing mft data size and initialized size
2150 * accordingly, open an ntfs_inode for it and return it to the caller, unless
2151 * there are less than 24 mft records, in which case we allocate and initialize
2152 * mft records until we reach record 24 which we consider as the first free mft
2153 * record for use by normal files.
2154 *
2155 * If during any stage we overflow the initialized data in the mft bitmap, we
2156 * extend the initialized size (and data size) by 8 bytes, allocating another
2157 * cluster if required. The bitmap data size has to be at least equal to the
2158 * number of mft records in the mft, but it can be bigger, in which case the
2159 * superfluous bits are padded with zeroes.
2160 *
2161 * Thus, when we return successfully (IS_ERR() is false), we will have:
2162 * - initialized / extended the mft bitmap if necessary,
2163 * - initialized / extended the mft data if necessary,
2164 * - set the bit corresponding to the mft record being allocated in the
2165 * mft bitmap,
2166 * - opened an ntfs_inode for the allocated mft record, and we will have
2167 * - returned the ntfs_inode as well as the allocated mapped, pinned, and
2168 * locked mft record.
2169 *
2170 * On error, the volume will be left in a consistent state and no record will
2171 * be allocated. If rolling back a partial operation fails, we may leave some
2172 * inconsistent metadata in which case we set NVolErrors() so the volume is
2173 * left dirty when unmounted.
2174 *
2175 * Note, this function cannot make use of most of the normal functions, like
2176 * for example for attribute resizing, etc, because when the run list overflows
2177 * the base mft record and an attribute list is used, it is very important that
2178 * the extension mft records used to store the $DATA attribute of $MFT can be
2179 * reached without having to read the information contained inside them, as
2180 * this would make it impossible to find them in the first place after the
2181 * volume is unmounted. $MFT/$BITMAP probably does not need to follow this
2182 * rule because the bitmap is not essential for finding the mft records, but on
2183 * the other hand, handling the bitmap in this special way would make life
2184 * easier because otherwise there might be circular invocations of functions
2185 * when reading the bitmap.
2186 */
2187ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode,
2188 ntfs_inode *base_ni, MFT_RECORD **mrec)
2189{
2190 s64 ll, bit, old_data_initialized, old_data_size;
2191 struct inode *vi;
2192 struct page *page;
2193 ntfs_inode *mft_ni, *mftbmp_ni, *ni;
2194 ntfs_attr_search_ctx *ctx;
2195 MFT_RECORD *m;
2196 ATTR_RECORD *a;
2197 pgoff_t index;
2198 unsigned int ofs;
2199 int err;
2200 le16 seq_no, usn;
2201 BOOL record_formatted = FALSE;
2202
2203 if (base_ni) {
2204 ntfs_debug("Entering (allocating an extent mft record for "
2205 "base mft record 0x%llx).",
2206 (long long)base_ni->mft_no);
2207 /* @mode and @base_ni are mutually exclusive. */
2208 BUG_ON(mode);
2209 } else
2210 ntfs_debug("Entering (allocating a base mft record).");
2211 if (mode) {
2212 /* @mode and @base_ni are mutually exclusive. */
2213 BUG_ON(base_ni);
2214 /* We only support creation of normal files and directories. */
2215 if (!S_ISREG(mode) && !S_ISDIR(mode))
2216 return ERR_PTR(-EOPNOTSUPP);
2217 }
2218 BUG_ON(!mrec);
2219 mft_ni = NTFS_I(vol->mft_ino);
2220 mftbmp_ni = NTFS_I(vol->mftbmp_ino);
2221 down_write(&vol->mftbmp_lock);
2222 bit = ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(vol, base_ni);
2223 if (bit >= 0) {
2224 ntfs_debug("Found and allocated free record (#1), bit 0x%llx.",
2225 (long long)bit);
2226 goto have_alloc_rec;
2227 }
2228 if (bit != -ENOSPC) {
2229 up_write(&vol->mftbmp_lock);
2230 return ERR_PTR(bit);
2231 }
2232 /*
2233 * No free mft records left. If the mft bitmap already covers more
2234 * than the currently used mft records, the next records are all free,
2235 * so we can simply allocate the first unused mft record.
2236 * Note: We also have to make sure that the mft bitmap at least covers
2237 * the first 24 mft records as they are special and whilst they may not
2238 * be in use, we do not allocate from them.
2239 */
2240 ll = mft_ni->initialized_size >> vol->mft_record_size_bits;
2241 if (mftbmp_ni->initialized_size << 3 > ll &&
2242 mftbmp_ni->initialized_size > 3) {
2243 bit = ll;
2244 if (bit < 24)
2245 bit = 24;
2246 if (unlikely(bit >= (1ll << 32)))
2247 goto max_err_out;
2248 ntfs_debug("Found free record (#2), bit 0x%llx.",
2249 (long long)bit);
2250 goto found_free_rec;
2251 }
2252 /*
2253 * The mft bitmap needs to be expanded until it covers the first unused
2254 * mft record that we can allocate.
2255 * Note: The smallest mft record we allocate is mft record 24.
2256 */
2257 bit = mftbmp_ni->initialized_size << 3;
2258 if (unlikely(bit >= (1ll << 32)))
2259 goto max_err_out;
2260 ntfs_debug("Status of mftbmp before extension: allocated_size 0x%llx, "
2261 "data_size 0x%llx, initialized_size 0x%llx.",
2262 (long long)mftbmp_ni->allocated_size,
2263 (long long)vol->mftbmp_ino->i_size,
2264 (long long)mftbmp_ni->initialized_size);
2265 if (mftbmp_ni->initialized_size + 8 > mftbmp_ni->allocated_size) {
2266 /* Need to extend bitmap by one more cluster. */
2267 ntfs_debug("mftbmp: initialized_size + 8 > allocated_size.");
2268 err = ntfs_mft_bitmap_extend_allocation_nolock(vol);
2269 if (unlikely(err)) {
2270 up_write(&vol->mftbmp_lock);
2271 goto err_out;
2272 }
2273 ntfs_debug("Status of mftbmp after allocation extension: "
2274 "allocated_size 0x%llx, data_size 0x%llx, "
2275 "initialized_size 0x%llx.",
2276 (long long)mftbmp_ni->allocated_size,
2277 (long long)vol->mftbmp_ino->i_size,
2278 (long long)mftbmp_ni->initialized_size);
2279 }
2280 /*
2281 * We now have sufficient allocated space, extend the initialized_size
2282 * as well as the data_size if necessary and fill the new space with
2283 * zeroes.
2284 */
2285 err = ntfs_mft_bitmap_extend_initialized_nolock(vol);
2286 if (unlikely(err)) {
2287 up_write(&vol->mftbmp_lock);
2288 goto err_out;
2289 }
2290 ntfs_debug("Status of mftbmp after initialized extention: "
2291 "allocated_size 0x%llx, data_size 0x%llx, "
2292 "initialized_size 0x%llx.",
2293 (long long)mftbmp_ni->allocated_size,
2294 (long long)vol->mftbmp_ino->i_size,
2295 (long long)mftbmp_ni->initialized_size);
2296 ntfs_debug("Found free record (#3), bit 0x%llx.", (long long)bit);
2297found_free_rec:
2298 /* @bit is the found free mft record, allocate it in the mft bitmap. */
2299 ntfs_debug("At found_free_rec.");
2300 err = ntfs_bitmap_set_bit(vol->mftbmp_ino, bit);
2301 if (unlikely(err)) {
2302 ntfs_error(vol->sb, "Failed to allocate bit in mft bitmap.");
2303 up_write(&vol->mftbmp_lock);
2304 goto err_out;
2305 }
2306 ntfs_debug("Set bit 0x%llx in mft bitmap.", (long long)bit);
2307have_alloc_rec:
2308 /*
2309 * The mft bitmap is now uptodate. Deal with mft data attribute now.
2310 * Note, we keep hold of the mft bitmap lock for writing until all
2311 * modifications to the mft data attribute are complete, too, as they
2312 * will impact decisions for mft bitmap and mft record allocation done
2313 * by a parallel allocation and if the lock is not maintained a
2314 * parallel allocation could allocate the same mft record as this one.
2315 */
2316 ll = (bit + 1) << vol->mft_record_size_bits;
2317 if (ll <= mft_ni->initialized_size) {
2318 ntfs_debug("Allocated mft record already initialized.");
2319 goto mft_rec_already_initialized;
2320 }
2321 ntfs_debug("Initializing allocated mft record.");
2322 /*
2323 * The mft record is outside the initialized data. Extend the mft data
2324 * attribute until it covers the allocated record. The loop is only
2325 * actually traversed more than once when a freshly formatted volume is
2326 * first written to so it optimizes away nicely in the common case.
2327 */
2328 ntfs_debug("Status of mft data before extension: "
2329 "allocated_size 0x%llx, data_size 0x%llx, "
2330 "initialized_size 0x%llx.",
2331 (long long)mft_ni->allocated_size,
2332 (long long)vol->mft_ino->i_size,
2333 (long long)mft_ni->initialized_size);
2334 while (ll > mft_ni->allocated_size) {
2335 err = ntfs_mft_data_extend_allocation_nolock(vol);
2336 if (unlikely(err)) {
2337 ntfs_error(vol->sb, "Failed to extend mft data "
2338 "allocation.");
2339 goto undo_mftbmp_alloc_nolock;
2340 }
2341 ntfs_debug("Status of mft data after allocation extension: "
2342 "allocated_size 0x%llx, data_size 0x%llx, "
2343 "initialized_size 0x%llx.",
2344 (long long)mft_ni->allocated_size,
2345 (long long)vol->mft_ino->i_size,
2346 (long long)mft_ni->initialized_size);
2347 }
2348 /*
2349 * Extend mft data initialized size (and data size of course) to reach
2350 * the allocated mft record, formatting the mft records along the way.
2351 * Note: We only modify the ntfs_inode structure as that is all that is
2352 * needed by ntfs_mft_record_format(). We will update the attribute
2353 * record itself in one fell swoop later on.
2354 */
2355 old_data_initialized = mft_ni->initialized_size;
2356 old_data_size = vol->mft_ino->i_size;
2357 while (ll > mft_ni->initialized_size) {
2358 s64 new_initialized_size, mft_no;
2359
2360 new_initialized_size = mft_ni->initialized_size +
2361 vol->mft_record_size;
2362 mft_no = mft_ni->initialized_size >> vol->mft_record_size_bits;
2363 if (new_initialized_size > vol->mft_ino->i_size)
2364 vol->mft_ino->i_size = new_initialized_size;
2365 ntfs_debug("Initializing mft record 0x%llx.",
2366 (long long)mft_no);
2367 err = ntfs_mft_record_format(vol, mft_no);
2368 if (unlikely(err)) {
2369 ntfs_error(vol->sb, "Failed to format mft record.");
2370 goto undo_data_init;
2371 }
2372 mft_ni->initialized_size = new_initialized_size;
2373 }
2374 record_formatted = TRUE;
2375 /* Update the mft data attribute record to reflect the new sizes. */
2376 m = map_mft_record(mft_ni);
2377 if (IS_ERR(m)) {
2378 ntfs_error(vol->sb, "Failed to map mft record.");
2379 err = PTR_ERR(m);
2380 goto undo_data_init;
2381 }
2382 ctx = ntfs_attr_get_search_ctx(mft_ni, m);
2383 if (unlikely(!ctx)) {
2384 ntfs_error(vol->sb, "Failed to get search context.");
2385 err = -ENOMEM;
2386 unmap_mft_record(mft_ni);
2387 goto undo_data_init;
2388 }
2389 err = ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len,
2390 CASE_SENSITIVE, 0, NULL, 0, ctx);
2391 if (unlikely(err)) {
2392 ntfs_error(vol->sb, "Failed to find first attribute extent of "
2393 "mft data attribute.");
2394 ntfs_attr_put_search_ctx(ctx);
2395 unmap_mft_record(mft_ni);
2396 goto undo_data_init;
2397 }
2398 a = ctx->attr;
2399 a->data.non_resident.initialized_size =
2400 cpu_to_sle64(mft_ni->initialized_size);
2401 a->data.non_resident.data_size = cpu_to_sle64(vol->mft_ino->i_size);
2402 /* Ensure the changes make it to disk. */
2403 flush_dcache_mft_record_page(ctx->ntfs_ino);
2404 mark_mft_record_dirty(ctx->ntfs_ino);
2405 ntfs_attr_put_search_ctx(ctx);
2406 unmap_mft_record(mft_ni);
2407 ntfs_debug("Status of mft data after mft record initialization: "
2408 "allocated_size 0x%llx, data_size 0x%llx, "
2409 "initialized_size 0x%llx.",
2410 (long long)mft_ni->allocated_size,
2411 (long long)vol->mft_ino->i_size,
2412 (long long)mft_ni->initialized_size);
2413 BUG_ON(vol->mft_ino->i_size > mft_ni->allocated_size);
2414 BUG_ON(mft_ni->initialized_size > vol->mft_ino->i_size);
2415mft_rec_already_initialized:
2416 /*
2417 * We can finally drop the mft bitmap lock as the mft data attribute
2418 * has been fully updated. The only disparity left is that the
2419 * allocated mft record still needs to be marked as in use to match the
2420 * set bit in the mft bitmap but this is actually not a problem since
2421 * this mft record is not referenced from anywhere yet and the fact
2422 * that it is allocated in the mft bitmap means that no-one will try to
2423 * allocate it either.
2424 */
2425 up_write(&vol->mftbmp_lock);
2426 /*
2427 * We now have allocated and initialized the mft record. Calculate the
2428 * index of and the offset within the page cache page the record is in.
2429 */
2430 index = bit << vol->mft_record_size_bits >> PAGE_CACHE_SHIFT;
2431 ofs = (bit << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK;
2432 /* Read, map, and pin the page containing the mft record. */
2433 page = ntfs_map_page(vol->mft_ino->i_mapping, index);
2434 if (unlikely(IS_ERR(page))) {
2435 ntfs_error(vol->sb, "Failed to map page containing allocated "
2436 "mft record 0x%llx.", (long long)bit);
2437 err = PTR_ERR(page);
2438 goto undo_mftbmp_alloc;
2439 }
2440 lock_page(page);
2441 BUG_ON(!PageUptodate(page));
2442 ClearPageUptodate(page);
2443 m = (MFT_RECORD*)((u8*)page_address(page) + ofs);
2444 /* If we just formatted the mft record no need to do it again. */
2445 if (!record_formatted) {
2446 /* Sanity check that the mft record is really not in use. */
2447 if (ntfs_is_file_record(m->magic) &&
2448 (m->flags & MFT_RECORD_IN_USE)) {
2449 ntfs_error(vol->sb, "Mft record 0x%llx was marked "
2450 "free in mft bitmap but is marked "
2451 "used itself. Corrupt filesystem. "
2452 "Unmount and run chkdsk.",
2453 (long long)bit);
2454 err = -EIO;
2455 SetPageUptodate(page);
2456 unlock_page(page);
2457 ntfs_unmap_page(page);
2458 NVolSetErrors(vol);
2459 goto undo_mftbmp_alloc;
2460 }
2461 /*
2462 * We need to (re-)format the mft record, preserving the
2463 * sequence number if it is not zero as well as the update
2464 * sequence number if it is not zero or -1 (0xffff). This
2465 * means we do not need to care whether or not something went
2466 * wrong with the previous mft record.
2467 */
2468 seq_no = m->sequence_number;
2469 usn = *(le16*)((u8*)m + le16_to_cpu(m->usa_ofs));
2470 err = ntfs_mft_record_layout(vol, bit, m);
2471 if (unlikely(err)) {
2472 ntfs_error(vol->sb, "Failed to layout allocated mft "
2473 "record 0x%llx.", (long long)bit);
2474 SetPageUptodate(page);
2475 unlock_page(page);
2476 ntfs_unmap_page(page);
2477 goto undo_mftbmp_alloc;
2478 }
2479 if (seq_no)
2480 m->sequence_number = seq_no;
2481 if (usn && le16_to_cpu(usn) != 0xffff)
2482 *(le16*)((u8*)m + le16_to_cpu(m->usa_ofs)) = usn;
2483 }
2484 /* Set the mft record itself in use. */
2485 m->flags |= MFT_RECORD_IN_USE;
2486 if (S_ISDIR(mode))
2487 m->flags |= MFT_RECORD_IS_DIRECTORY;
2488 flush_dcache_page(page);
2489 SetPageUptodate(page);
2490 if (base_ni) {
2491 /*
2492 * Setup the base mft record in the extent mft record. This
2493 * completes initialization of the allocated extent mft record
2494 * and we can simply use it with map_extent_mft_record().
2495 */
2496 m->base_mft_record = MK_LE_MREF(base_ni->mft_no,
2497 base_ni->seq_no);
2498 /*
2499 * Allocate an extent inode structure for the new mft record,
2500 * attach it to the base inode @base_ni and map, pin, and lock
2501 * its, i.e. the allocated, mft record.
2502 */
2503 m = map_extent_mft_record(base_ni, bit, &ni);
2504 if (IS_ERR(m)) {
2505 ntfs_error(vol->sb, "Failed to map allocated extent "
2506 "mft record 0x%llx.", (long long)bit);
2507 err = PTR_ERR(m);
2508 /* Set the mft record itself not in use. */
2509 m->flags &= cpu_to_le16(
2510 ~le16_to_cpu(MFT_RECORD_IN_USE));
2511 flush_dcache_page(page);
2512 /* Make sure the mft record is written out to disk. */
2513 mark_ntfs_record_dirty(page, ofs);
2514 unlock_page(page);
2515 ntfs_unmap_page(page);
2516 goto undo_mftbmp_alloc;
2517 }
2518 /*
2519 * Make sure the allocated mft record is written out to disk.
2520 * No need to set the inode dirty because the caller is going
2521 * to do that anyway after finishing with the new extent mft
2522 * record (e.g. at a minimum a new attribute will be added to
2523 * the mft record).
2524 */
2525 mark_ntfs_record_dirty(page, ofs);
2526 unlock_page(page);
2527 /*
2528 * Need to unmap the page since map_extent_mft_record() mapped
2529 * it as well so we have it mapped twice at the moment.
2530 */
2531 ntfs_unmap_page(page);
2532 } else {
2533 /*
2534 * Allocate a new VFS inode and set it up. NOTE: @vi->i_nlink
2535 * is set to 1 but the mft record->link_count is 0. The caller
2536 * needs to bear this in mind.
2537 */
2538 vi = new_inode(vol->sb);
2539 if (unlikely(!vi)) {
2540 err = -ENOMEM;
2541 /* Set the mft record itself not in use. */
2542 m->flags &= cpu_to_le16(
2543 ~le16_to_cpu(MFT_RECORD_IN_USE));
2544 flush_dcache_page(page);
2545 /* Make sure the mft record is written out to disk. */
2546 mark_ntfs_record_dirty(page, ofs);
2547 unlock_page(page);
2548 ntfs_unmap_page(page);
2549 goto undo_mftbmp_alloc;
2550 }
2551 vi->i_ino = bit;
2552 /*
2553 * This is the optimal IO size (for stat), not the fs block
2554 * size.
2555 */
2556 vi->i_blksize = PAGE_CACHE_SIZE;
2557 /*
2558 * This is for checking whether an inode has changed w.r.t. a
2559 * file so that the file can be updated if necessary (compare
2560 * with f_version).
2561 */
2562 vi->i_version = 1;
2563
2564 /* The owner and group come from the ntfs volume. */
2565 vi->i_uid = vol->uid;
2566 vi->i_gid = vol->gid;
2567
2568 /* Initialize the ntfs specific part of @vi. */
2569 ntfs_init_big_inode(vi);
2570 ni = NTFS_I(vi);
2571 /*
2572 * Set the appropriate mode, attribute type, and name. For
2573 * directories, also setup the index values to the defaults.
2574 */
2575 if (S_ISDIR(mode)) {
2576 vi->i_mode = S_IFDIR | S_IRWXUGO;
2577 vi->i_mode &= ~vol->dmask;
2578
2579 NInoSetMstProtected(ni);
2580 ni->type = AT_INDEX_ALLOCATION;
2581 ni->name = I30;
2582 ni->name_len = 4;
2583
2584 ni->itype.index.block_size = 4096;
2585 ni->itype.index.block_size_bits = generic_ffs(4096) - 1;
2586 ni->itype.index.collation_rule = COLLATION_FILE_NAME;
2587 if (vol->cluster_size <= ni->itype.index.block_size) {
2588 ni->itype.index.vcn_size = vol->cluster_size;
2589 ni->itype.index.vcn_size_bits =
2590 vol->cluster_size_bits;
2591 } else {
2592 ni->itype.index.vcn_size = vol->sector_size;
2593 ni->itype.index.vcn_size_bits =
2594 vol->sector_size_bits;
2595 }
2596 } else {
2597 vi->i_mode = S_IFREG | S_IRWXUGO;
2598 vi->i_mode &= ~vol->fmask;
2599
2600 ni->type = AT_DATA;
2601 ni->name = NULL;
2602 ni->name_len = 0;
2603 }
2604 if (IS_RDONLY(vi))
2605 vi->i_mode &= ~S_IWUGO;
2606
2607 /* Set the inode times to the current time. */
2608 vi->i_atime = vi->i_mtime = vi->i_ctime =
2609 current_fs_time(vi->i_sb);
2610 /*
2611 * Set the file size to 0, the ntfs inode sizes are set to 0 by
2612 * the call to ntfs_init_big_inode() below.
2613 */
2614 vi->i_size = 0;
2615 vi->i_blocks = 0;
2616
2617 /* Set the sequence number. */
2618 vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number);
2619 /*
2620 * Manually map, pin, and lock the mft record as we already
2621 * have its page mapped and it is very easy to do.
2622 */
2623 atomic_inc(&ni->count);
2624 down(&ni->mrec_lock);
2625 ni->page = page;
2626 ni->page_ofs = ofs;
2627 /*
2628 * Make sure the allocated mft record is written out to disk.
2629 * NOTE: We do not set the ntfs inode dirty because this would
2630 * fail in ntfs_write_inode() because the inode does not have a
2631 * standard information attribute yet. Also, there is no need
2632 * to set the inode dirty because the caller is going to do
2633 * that anyway after finishing with the new mft record (e.g. at
2634 * a minimum some new attributes will be added to the mft
2635 * record.
2636 */
2637 mark_ntfs_record_dirty(page, ofs);
2638 unlock_page(page);
2639
2640 /* Add the inode to the inode hash for the superblock. */
2641 insert_inode_hash(vi);
2642
2643 /* Update the default mft allocation position. */
2644 vol->mft_data_pos = bit + 1;
2645 }
2646 /*
2647 * Return the opened, allocated inode of the allocated mft record as
2648 * well as the mapped, pinned, and locked mft record.
2649 */
2650 ntfs_debug("Returning opened, allocated %sinode 0x%llx.",
2651 base_ni ? "extent " : "", (long long)bit);
2652 *mrec = m;
2653 return ni;
2654undo_data_init:
2655 mft_ni->initialized_size = old_data_initialized;
2656 vol->mft_ino->i_size = old_data_size;
2657 goto undo_mftbmp_alloc_nolock;
2658undo_mftbmp_alloc:
2659 down_write(&vol->mftbmp_lock);
2660undo_mftbmp_alloc_nolock:
2661 if (ntfs_bitmap_clear_bit(vol->mftbmp_ino, bit)) {
2662 ntfs_error(vol->sb, "Failed to clear bit in mft bitmap.%s", es);
2663 NVolSetErrors(vol);
2664 }
2665 up_write(&vol->mftbmp_lock);
2666err_out:
2667 return ERR_PTR(err);
2668max_err_out:
2669 ntfs_warning(vol->sb, "Cannot allocate mft record because the maximum "
2670 "number of inodes (2^32) has already been reached.");
2671 up_write(&vol->mftbmp_lock);
2672 return ERR_PTR(-ENOSPC);
2673}
2674
2675/**
2676 * ntfs_extent_mft_record_free - free an extent mft record on an ntfs volume
2677 * @ni: ntfs inode of the mapped extent mft record to free
2678 * @m: mapped extent mft record of the ntfs inode @ni
2679 *
2680 * Free the mapped extent mft record @m of the extent ntfs inode @ni.
2681 *
2682 * Note that this function unmaps the mft record and closes and destroys @ni
2683 * internally and hence you cannot use either @ni nor @m any more after this
2684 * function returns success.
2685 *
2686 * On success return 0 and on error return -errno. @ni and @m are still valid
2687 * in this case and have not been freed.
2688 *
2689 * For some errors an error message is displayed and the success code 0 is
2690 * returned and the volume is then left dirty on umount. This makes sense in
2691 * case we could not rollback the changes that were already done since the
2692 * caller no longer wants to reference this mft record so it does not matter to
2693 * the caller if something is wrong with it as long as it is properly detached
2694 * from the base inode.
2695 */
2696int ntfs_extent_mft_record_free(ntfs_inode *ni, MFT_RECORD *m)
2697{
2698 unsigned long mft_no = ni->mft_no;
2699 ntfs_volume *vol = ni->vol;
2700 ntfs_inode *base_ni;
2701 ntfs_inode **extent_nis;
2702 int i, err;
2703 le16 old_seq_no;
2704 u16 seq_no;
2705
2706 BUG_ON(NInoAttr(ni));
2707 BUG_ON(ni->nr_extents != -1);
2708
2709 down(&ni->extent_lock);
2710 base_ni = ni->ext.base_ntfs_ino;
2711 up(&ni->extent_lock);
2712
2713 BUG_ON(base_ni->nr_extents <= 0);
2714
2715 ntfs_debug("Entering for extent inode 0x%lx, base inode 0x%lx.\n",
2716 mft_no, base_ni->mft_no);
2717
2718 down(&base_ni->extent_lock);
2719
2720 /* Make sure we are holding the only reference to the extent inode. */
2721 if (atomic_read(&ni->count) > 2) {
2722 ntfs_error(vol->sb, "Tried to free busy extent inode 0x%lx, "
2723 "not freeing.", base_ni->mft_no);
2724 up(&base_ni->extent_lock);
2725 return -EBUSY;
2726 }
2727
2728 /* Dissociate the ntfs inode from the base inode. */
2729 extent_nis = base_ni->ext.extent_ntfs_inos;
2730 err = -ENOENT;
2731 for (i = 0; i < base_ni->nr_extents; i++) {
2732 if (ni != extent_nis[i])
2733 continue;
2734 extent_nis += i;
2735 base_ni->nr_extents--;
2736 memmove(extent_nis, extent_nis + 1, (base_ni->nr_extents - i) *
2737 sizeof(ntfs_inode*));
2738 err = 0;
2739 break;
2740 }
2741
2742 up(&base_ni->extent_lock);
2743
2744 if (unlikely(err)) {
2745 ntfs_error(vol->sb, "Extent inode 0x%lx is not attached to "
2746 "its base inode 0x%lx.", mft_no,
2747 base_ni->mft_no);
2748 BUG();
2749 }
2750
2751 /*
2752 * The extent inode is no longer attached to the base inode so no one
2753 * can get a reference to it any more.
2754 */
2755
2756 /* Mark the mft record as not in use. */
2757 m->flags &= const_cpu_to_le16(~const_le16_to_cpu(MFT_RECORD_IN_USE));
2758
2759 /* Increment the sequence number, skipping zero, if it is not zero. */
2760 old_seq_no = m->sequence_number;
2761 seq_no = le16_to_cpu(old_seq_no);
2762 if (seq_no == 0xffff)
2763 seq_no = 1;
2764 else if (seq_no)
2765 seq_no++;
2766 m->sequence_number = cpu_to_le16(seq_no);
2767
2768 /*
2769 * Set the ntfs inode dirty and write it out. We do not need to worry
2770 * about the base inode here since whatever caused the extent mft
2771 * record to be freed is guaranteed to do it already.
2772 */
2773 NInoSetDirty(ni);
2774 err = write_mft_record(ni, m, 0);
2775 if (unlikely(err)) {
2776 ntfs_error(vol->sb, "Failed to write mft record 0x%lx, not "
2777 "freeing.", mft_no);
2778 goto rollback;
2779 }
2780rollback_error:
2781 /* Unmap and throw away the now freed extent inode. */
2782 unmap_extent_mft_record(ni);
2783 ntfs_clear_extent_inode(ni);
2784
2785 /* Clear the bit in the $MFT/$BITMAP corresponding to this record. */
2786 down_write(&vol->mftbmp_lock);
2787 err = ntfs_bitmap_clear_bit(vol->mftbmp_ino, mft_no);
2788 up_write(&vol->mftbmp_lock);
2789 if (unlikely(err)) {
2790 /*
2791 * The extent inode is gone but we failed to deallocate it in
2792 * the mft bitmap. Just emit a warning and leave the volume
2793 * dirty on umount.
2794 */
2795 ntfs_error(vol->sb, "Failed to clear bit in mft bitmap.%s", es);
2796 NVolSetErrors(vol);
2797 }
2798 return 0;
2799rollback:
2800 /* Rollback what we did... */
2801 down(&base_ni->extent_lock);
2802 extent_nis = base_ni->ext.extent_ntfs_inos;
2803 if (!(base_ni->nr_extents & 3)) {
2804 int new_size = (base_ni->nr_extents + 4) * sizeof(ntfs_inode*);
2805
2806 extent_nis = (ntfs_inode**)kmalloc(new_size, GFP_NOFS);
2807 if (unlikely(!extent_nis)) {
2808 ntfs_error(vol->sb, "Failed to allocate internal "
2809 "buffer during rollback.%s", es);
2810 up(&base_ni->extent_lock);
2811 NVolSetErrors(vol);
2812 goto rollback_error;
2813 }
2814 if (base_ni->nr_extents) {
2815 BUG_ON(!base_ni->ext.extent_ntfs_inos);
2816 memcpy(extent_nis, base_ni->ext.extent_ntfs_inos,
2817 new_size - 4 * sizeof(ntfs_inode*));
2818 kfree(base_ni->ext.extent_ntfs_inos);
2819 }
2820 base_ni->ext.extent_ntfs_inos = extent_nis;
2821 }
2822 m->flags |= MFT_RECORD_IN_USE;
2823 m->sequence_number = old_seq_no;
2824 extent_nis[base_ni->nr_extents++] = ni;
2825 up(&base_ni->extent_lock);
2826 mark_mft_record_dirty(ni);
2827 return err;
2828}
2829#endif /* NTFS_RW */
diff --git a/fs/ntfs/mft.h b/fs/ntfs/mft.h
new file mode 100644
index 000000000000..407de2cef1d6
--- /dev/null
+++ b/fs/ntfs/mft.h
@@ -0,0 +1,127 @@
1/*
2 * mft.h - Defines for mft record handling in NTFS Linux kernel driver.
3 * Part of the Linux-NTFS project.
4 *
5 * Copyright (c) 2001-2004 Anton Altaparmakov
6 *
7 * This program/include file is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as published
9 * by the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program/include file is distributed in the hope that it will be
13 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program (in the main directory of the Linux-NTFS
19 * distribution in the file COPYING); if not, write to the Free Software
20 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 */
22
23#ifndef _LINUX_NTFS_MFT_H
24#define _LINUX_NTFS_MFT_H
25
26#include <linux/fs.h>
27#include <linux/highmem.h>
28#include <linux/pagemap.h>
29
30#include "inode.h"
31
32extern MFT_RECORD *map_mft_record(ntfs_inode *ni);
33extern void unmap_mft_record(ntfs_inode *ni);
34
35extern MFT_RECORD *map_extent_mft_record(ntfs_inode *base_ni, MFT_REF mref,
36 ntfs_inode **ntfs_ino);
37
38static inline void unmap_extent_mft_record(ntfs_inode *ni)
39{
40 unmap_mft_record(ni);
41 return;
42}
43
44#ifdef NTFS_RW
45
46/**
47 * flush_dcache_mft_record_page - flush_dcache_page() for mft records
48 * @ni: ntfs inode structure of mft record
49 *
50 * Call flush_dcache_page() for the page in which an mft record resides.
51 *
52 * This must be called every time an mft record is modified, just after the
53 * modification.
54 */
55static inline void flush_dcache_mft_record_page(ntfs_inode *ni)
56{
57 flush_dcache_page(ni->page);
58}
59
60extern void __mark_mft_record_dirty(ntfs_inode *ni);
61
62/**
63 * mark_mft_record_dirty - set the mft record and the page containing it dirty
64 * @ni: ntfs inode describing the mapped mft record
65 *
66 * Set the mapped (extent) mft record of the (base or extent) ntfs inode @ni,
67 * as well as the page containing the mft record, dirty. Also, mark the base
68 * vfs inode dirty. This ensures that any changes to the mft record are
69 * written out to disk.
70 *
71 * NOTE: Do not do anything if the mft record is already marked dirty.
72 */
73static inline void mark_mft_record_dirty(ntfs_inode *ni)
74{
75 if (!NInoTestSetDirty(ni))
76 __mark_mft_record_dirty(ni);
77}
78
79extern int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no,
80 MFT_RECORD *m, int sync);
81
82extern int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync);
83
84/**
85 * write_mft_record - write out a mapped (extent) mft record
86 * @ni: ntfs inode describing the mapped (extent) mft record
87 * @m: mapped (extent) mft record to write
88 * @sync: if true, wait for i/o completion
89 *
90 * This is just a wrapper for write_mft_record_nolock() (see mft.c), which
91 * locks the page for the duration of the write. This ensures that there are
92 * no race conditions between writing the mft record via the dirty inode code
93 * paths and via the page cache write back code paths or between writing
94 * neighbouring mft records residing in the same page.
95 *
96 * Locking the page also serializes us against ->readpage() if the page is not
97 * uptodate.
98 *
99 * On success, clean the mft record and return 0. On error, leave the mft
100 * record dirty and return -errno. The caller should call make_bad_inode() on
101 * the base inode to ensure no more access happens to this inode. We do not do
102 * it here as the caller may want to finish writing other extent mft records
103 * first to minimize on-disk metadata inconsistencies.
104 */
105static inline int write_mft_record(ntfs_inode *ni, MFT_RECORD *m, int sync)
106{
107 struct page *page = ni->page;
108 int err;
109
110 BUG_ON(!page);
111 lock_page(page);
112 err = write_mft_record_nolock(ni, m, sync);
113 unlock_page(page);
114 return err;
115}
116
117extern BOOL ntfs_may_write_mft_record(ntfs_volume *vol,
118 const unsigned long mft_no, const MFT_RECORD *m,
119 ntfs_inode **locked_ni);
120
121extern ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode,
122 ntfs_inode *base_ni, MFT_RECORD **mrec);
123extern int ntfs_extent_mft_record_free(ntfs_inode *ni, MFT_RECORD *m);
124
125#endif /* NTFS_RW */
126
127#endif /* _LINUX_NTFS_MFT_H */
diff --git a/fs/ntfs/mst.c b/fs/ntfs/mst.c
new file mode 100644
index 000000000000..5a858d839b65
--- /dev/null
+++ b/fs/ntfs/mst.c
@@ -0,0 +1,203 @@
1/*
2 * mst.c - NTFS multi sector transfer protection handling code. Part of the
3 * Linux-NTFS project.
4 *
5 * Copyright (c) 2001-2004 Anton Altaparmakov
6 *
7 * This program/include file is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as published
9 * by the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program/include file is distributed in the hope that it will be
13 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program (in the main directory of the Linux-NTFS
19 * distribution in the file COPYING); if not, write to the Free Software
20 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 */
22
23#include "ntfs.h"
24
25/**
26 * post_read_mst_fixup - deprotect multi sector transfer protected data
27 * @b: pointer to the data to deprotect
28 * @size: size in bytes of @b
29 *
30 * Perform the necessary post read multi sector transfer fixup and detect the
31 * presence of incomplete multi sector transfers. - In that case, overwrite the
32 * magic of the ntfs record header being processed with "BAAD" (in memory only!)
33 * and abort processing.
34 *
35 * Return 0 on success and -EINVAL on error ("BAAD" magic will be present).
36 *
37 * NOTE: We consider the absence / invalidity of an update sequence array to
38 * mean that the structure is not protected at all and hence doesn't need to
39 * be fixed up. Thus, we return success and not failure in this case. This is
40 * in contrast to pre_write_mst_fixup(), see below.
41 */
42int post_read_mst_fixup(NTFS_RECORD *b, const u32 size)
43{
44 u16 usa_ofs, usa_count, usn;
45 u16 *usa_pos, *data_pos;
46
47 /* Setup the variables. */
48 usa_ofs = le16_to_cpu(b->usa_ofs);
49 /* Decrement usa_count to get number of fixups. */
50 usa_count = le16_to_cpu(b->usa_count) - 1;
51 /* Size and alignment checks. */
52 if ( size & (NTFS_BLOCK_SIZE - 1) ||
53 usa_ofs & 1 ||
54 usa_ofs + (usa_count * 2) > size ||
55 (size >> NTFS_BLOCK_SIZE_BITS) != usa_count)
56 return 0;
57 /* Position of usn in update sequence array. */
58 usa_pos = (u16*)b + usa_ofs/sizeof(u16);
59 /*
60 * The update sequence number which has to be equal to each of the
61 * u16 values before they are fixed up. Note no need to care for
62 * endianness since we are comparing and moving data for on disk
63 * structures which means the data is consistent. - If it is
64 * consistenty the wrong endianness it doesn't make any difference.
65 */
66 usn = *usa_pos;
67 /*
68 * Position in protected data of first u16 that needs fixing up.
69 */
70 data_pos = (u16*)b + NTFS_BLOCK_SIZE/sizeof(u16) - 1;
71 /*
72 * Check for incomplete multi sector transfer(s).
73 */
74 while (usa_count--) {
75 if (*data_pos != usn) {
76 /*
77 * Incomplete multi sector transfer detected! )-:
78 * Set the magic to "BAAD" and return failure.
79 * Note that magic_BAAD is already converted to le32.
80 */
81 b->magic = magic_BAAD;
82 return -EINVAL;
83 }
84 data_pos += NTFS_BLOCK_SIZE/sizeof(u16);
85 }
86 /* Re-setup the variables. */
87 usa_count = le16_to_cpu(b->usa_count) - 1;
88 data_pos = (u16*)b + NTFS_BLOCK_SIZE/sizeof(u16) - 1;
89 /* Fixup all sectors. */
90 while (usa_count--) {
91 /*
92 * Increment position in usa and restore original data from
93 * the usa into the data buffer.
94 */
95 *data_pos = *(++usa_pos);
96 /* Increment position in data as well. */
97 data_pos += NTFS_BLOCK_SIZE/sizeof(u16);
98 }
99 return 0;
100}
101
102/**
103 * pre_write_mst_fixup - apply multi sector transfer protection
104 * @b: pointer to the data to protect
105 * @size: size in bytes of @b
106 *
107 * Perform the necessary pre write multi sector transfer fixup on the data
108 * pointer to by @b of @size.
109 *
110 * Return 0 if fixup applied (success) or -EINVAL if no fixup was performed
111 * (assumed not needed). This is in contrast to post_read_mst_fixup() above.
112 *
113 * NOTE: We consider the absence / invalidity of an update sequence array to
114 * mean that the structure is not subject to protection and hence doesn't need
115 * to be fixed up. This means that you have to create a valid update sequence
116 * array header in the ntfs record before calling this function, otherwise it
117 * will fail (the header needs to contain the position of the update sequence
118 * array together with the number of elements in the array). You also need to
119 * initialise the update sequence number before calling this function
120 * otherwise a random word will be used (whatever was in the record at that
121 * position at that time).
122 */
123int pre_write_mst_fixup(NTFS_RECORD *b, const u32 size)
124{
125 le16 *usa_pos, *data_pos;
126 u16 usa_ofs, usa_count, usn;
127 le16 le_usn;
128
129 /* Sanity check + only fixup if it makes sense. */
130 if (!b || ntfs_is_baad_record(b->magic) ||
131 ntfs_is_hole_record(b->magic))
132 return -EINVAL;
133 /* Setup the variables. */
134 usa_ofs = le16_to_cpu(b->usa_ofs);
135 /* Decrement usa_count to get number of fixups. */
136 usa_count = le16_to_cpu(b->usa_count) - 1;
137 /* Size and alignment checks. */
138 if ( size & (NTFS_BLOCK_SIZE - 1) ||
139 usa_ofs & 1 ||
140 usa_ofs + (usa_count * 2) > size ||
141 (size >> NTFS_BLOCK_SIZE_BITS) != usa_count)
142 return -EINVAL;
143 /* Position of usn in update sequence array. */
144 usa_pos = (le16*)((u8*)b + usa_ofs);
145 /*
146 * Cyclically increment the update sequence number
147 * (skipping 0 and -1, i.e. 0xffff).
148 */
149 usn = le16_to_cpup(usa_pos) + 1;
150 if (usn == 0xffff || !usn)
151 usn = 1;
152 le_usn = cpu_to_le16(usn);
153 *usa_pos = le_usn;
154 /* Position in data of first u16 that needs fixing up. */
155 data_pos = (le16*)b + NTFS_BLOCK_SIZE/sizeof(le16) - 1;
156 /* Fixup all sectors. */
157 while (usa_count--) {
158 /*
159 * Increment the position in the usa and save the
160 * original data from the data buffer into the usa.
161 */
162 *(++usa_pos) = *data_pos;
163 /* Apply fixup to data. */
164 *data_pos = le_usn;
165 /* Increment position in data as well. */
166 data_pos += NTFS_BLOCK_SIZE/sizeof(le16);
167 }
168 return 0;
169}
170
171/**
172 * post_write_mst_fixup - fast deprotect multi sector transfer protected data
173 * @b: pointer to the data to deprotect
174 *
175 * Perform the necessary post write multi sector transfer fixup, not checking
176 * for any errors, because we assume we have just used pre_write_mst_fixup(),
177 * thus the data will be fine or we would never have gotten here.
178 */
179void post_write_mst_fixup(NTFS_RECORD *b)
180{
181 le16 *usa_pos, *data_pos;
182
183 u16 usa_ofs = le16_to_cpu(b->usa_ofs);
184 u16 usa_count = le16_to_cpu(b->usa_count) - 1;
185
186 /* Position of usn in update sequence array. */
187 usa_pos = (le16*)b + usa_ofs/sizeof(le16);
188
189 /* Position in protected data of first u16 that needs fixing up. */
190 data_pos = (le16*)b + NTFS_BLOCK_SIZE/sizeof(le16) - 1;
191
192 /* Fixup all sectors. */
193 while (usa_count--) {
194 /*
195 * Increment position in usa and restore original data from
196 * the usa into the data buffer.
197 */
198 *data_pos = *(++usa_pos);
199
200 /* Increment position in data as well. */
201 data_pos += NTFS_BLOCK_SIZE/sizeof(le16);
202 }
203}
diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c
new file mode 100644
index 000000000000..7c7e13b43b2e
--- /dev/null
+++ b/fs/ntfs/namei.c
@@ -0,0 +1,498 @@
1/*
2 * namei.c - NTFS kernel directory inode operations. Part of the Linux-NTFS
3 * project.
4 *
5 * Copyright (c) 2001-2004 Anton Altaparmakov
6 *
7 * This program/include file is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as published
9 * by the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program/include file is distributed in the hope that it will be
13 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program (in the main directory of the Linux-NTFS
19 * distribution in the file COPYING); if not, write to the Free Software
20 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 */
22
23#include <linux/dcache.h>
24#include <linux/security.h>
25
26#include "attrib.h"
27#include "debug.h"
28#include "dir.h"
29#include "mft.h"
30#include "ntfs.h"
31
32/**
33 * ntfs_lookup - find the inode represented by a dentry in a directory inode
34 * @dir_ino: directory inode in which to look for the inode
35 * @dent: dentry representing the inode to look for
36 * @nd: lookup nameidata
37 *
38 * In short, ntfs_lookup() looks for the inode represented by the dentry @dent
39 * in the directory inode @dir_ino and if found attaches the inode to the
40 * dentry @dent.
41 *
42 * In more detail, the dentry @dent specifies which inode to look for by
43 * supplying the name of the inode in @dent->d_name.name. ntfs_lookup()
44 * converts the name to Unicode and walks the contents of the directory inode
45 * @dir_ino looking for the converted Unicode name. If the name is found in the
46 * directory, the corresponding inode is loaded by calling ntfs_iget() on its
47 * inode number and the inode is associated with the dentry @dent via a call to
48 * d_splice_alias().
49 *
50 * If the name is not found in the directory, a NULL inode is inserted into the
51 * dentry @dent via a call to d_add(). The dentry is then termed a negative
52 * dentry.
53 *
54 * Only if an actual error occurs, do we return an error via ERR_PTR().
55 *
56 * In order to handle the case insensitivity issues of NTFS with regards to the
57 * dcache and the dcache requiring only one dentry per directory, we deal with
58 * dentry aliases that only differ in case in ->ntfs_lookup() while maintaining
59 * a case sensitive dcache. This means that we get the full benefit of dcache
60 * speed when the file/directory is looked up with the same case as returned by
61 * ->ntfs_readdir() but that a lookup for any other case (or for the short file
62 * name) will not find anything in dcache and will enter ->ntfs_lookup()
63 * instead, where we search the directory for a fully matching file name
64 * (including case) and if that is not found, we search for a file name that
65 * matches with different case and if that has non-POSIX semantics we return
66 * that. We actually do only one search (case sensitive) and keep tabs on
67 * whether we have found a case insensitive match in the process.
68 *
69 * To simplify matters for us, we do not treat the short vs long filenames as
70 * two hard links but instead if the lookup matches a short filename, we
71 * return the dentry for the corresponding long filename instead.
72 *
73 * There are three cases we need to distinguish here:
74 *
75 * 1) @dent perfectly matches (i.e. including case) a directory entry with a
76 * file name in the WIN32 or POSIX namespaces. In this case
77 * ntfs_lookup_inode_by_name() will return with name set to NULL and we
78 * just d_splice_alias() @dent.
79 * 2) @dent matches (not including case) a directory entry with a file name in
80 * the WIN32 namespace. In this case ntfs_lookup_inode_by_name() will return
81 * with name set to point to a kmalloc()ed ntfs_name structure containing
82 * the properly cased little endian Unicode name. We convert the name to the
83 * current NLS code page, search if a dentry with this name already exists
84 * and if so return that instead of @dent. At this point things are
85 * complicated by the possibility of 'disconnected' dentries due to NFS
86 * which we deal with appropriately (see the code comments). The VFS will
87 * then destroy the old @dent and use the one we returned. If a dentry is
88 * not found, we allocate a new one, d_splice_alias() it, and return it as
89 * above.
90 * 3) @dent matches either perfectly or not (i.e. we don't care about case) a
91 * directory entry with a file name in the DOS namespace. In this case
92 * ntfs_lookup_inode_by_name() will return with name set to point to a
93 * kmalloc()ed ntfs_name structure containing the mft reference (cpu endian)
94 * of the inode. We use the mft reference to read the inode and to find the
95 * file name in the WIN32 namespace corresponding to the matched short file
96 * name. We then convert the name to the current NLS code page, and proceed
97 * searching for a dentry with this name, etc, as in case 2), above.
98 *
99 * Locking: Caller must hold i_sem on the directory.
100 */
101static struct dentry *ntfs_lookup(struct inode *dir_ino, struct dentry *dent,
102 struct nameidata *nd)
103{
104 ntfs_volume *vol = NTFS_SB(dir_ino->i_sb);
105 struct inode *dent_inode;
106 ntfschar *uname;
107 ntfs_name *name = NULL;
108 MFT_REF mref;
109 unsigned long dent_ino;
110 int uname_len;
111
112 ntfs_debug("Looking up %s in directory inode 0x%lx.",
113 dent->d_name.name, dir_ino->i_ino);
114 /* Convert the name of the dentry to Unicode. */
115 uname_len = ntfs_nlstoucs(vol, dent->d_name.name, dent->d_name.len,
116 &uname);
117 if (uname_len < 0) {
118 ntfs_error(vol->sb, "Failed to convert name to Unicode.");
119 return ERR_PTR(uname_len);
120 }
121 mref = ntfs_lookup_inode_by_name(NTFS_I(dir_ino), uname, uname_len,
122 &name);
123 kmem_cache_free(ntfs_name_cache, uname);
124 if (!IS_ERR_MREF(mref)) {
125 dent_ino = MREF(mref);
126 ntfs_debug("Found inode 0x%lx. Calling ntfs_iget.", dent_ino);
127 dent_inode = ntfs_iget(vol->sb, dent_ino);
128 if (likely(!IS_ERR(dent_inode))) {
129 /* Consistency check. */
130 if (is_bad_inode(dent_inode) || MSEQNO(mref) ==
131 NTFS_I(dent_inode)->seq_no ||
132 dent_ino == FILE_MFT) {
133 /* Perfect WIN32/POSIX match. -- Case 1. */
134 if (!name) {
135 ntfs_debug("Done. (Case 1.)");
136 return d_splice_alias(dent_inode, dent);
137 }
138 /*
139 * We are too indented. Handle imperfect
140 * matches and short file names further below.
141 */
142 goto handle_name;
143 }
144 ntfs_error(vol->sb, "Found stale reference to inode "
145 "0x%lx (reference sequence number = "
146 "0x%x, inode sequence number = 0x%x), "
147 "returning -EIO. Run chkdsk.",
148 dent_ino, MSEQNO(mref),
149 NTFS_I(dent_inode)->seq_no);
150 iput(dent_inode);
151 dent_inode = ERR_PTR(-EIO);
152 } else
153 ntfs_error(vol->sb, "ntfs_iget(0x%lx) failed with "
154 "error code %li.", dent_ino,
155 PTR_ERR(dent_inode));
156 if (name)
157 kfree(name);
158 /* Return the error code. */
159 return (struct dentry *)dent_inode;
160 }
161 /* It is guaranteed that name is no longer allocated at this point. */
162 if (MREF_ERR(mref) == -ENOENT) {
163 ntfs_debug("Entry was not found, adding negative dentry.");
164 /* The dcache will handle negative entries. */
165 d_add(dent, NULL);
166 ntfs_debug("Done.");
167 return NULL;
168 }
169 ntfs_error(vol->sb, "ntfs_lookup_ino_by_name() failed with error "
170 "code %i.", -MREF_ERR(mref));
171 return ERR_PTR(MREF_ERR(mref));
172
173 // TODO: Consider moving this lot to a separate function! (AIA)
174handle_name:
175 {
176 struct dentry *real_dent, *new_dent;
177 MFT_RECORD *m;
178 ntfs_attr_search_ctx *ctx;
179 ntfs_inode *ni = NTFS_I(dent_inode);
180 int err;
181 struct qstr nls_name;
182
183 nls_name.name = NULL;
184 if (name->type != FILE_NAME_DOS) { /* Case 2. */
185 ntfs_debug("Case 2.");
186 nls_name.len = (unsigned)ntfs_ucstonls(vol,
187 (ntfschar*)&name->name, name->len,
188 (unsigned char**)&nls_name.name, 0);
189 kfree(name);
190 } else /* if (name->type == FILE_NAME_DOS) */ { /* Case 3. */
191 FILE_NAME_ATTR *fn;
192
193 ntfs_debug("Case 3.");
194 kfree(name);
195
196 /* Find the WIN32 name corresponding to the matched DOS name. */
197 ni = NTFS_I(dent_inode);
198 m = map_mft_record(ni);
199 if (IS_ERR(m)) {
200 err = PTR_ERR(m);
201 m = NULL;
202 ctx = NULL;
203 goto err_out;
204 }
205 ctx = ntfs_attr_get_search_ctx(ni, m);
206 if (unlikely(!ctx)) {
207 err = -ENOMEM;
208 goto err_out;
209 }
210 do {
211 ATTR_RECORD *a;
212 u32 val_len;
213
214 err = ntfs_attr_lookup(AT_FILE_NAME, NULL, 0, 0, 0,
215 NULL, 0, ctx);
216 if (unlikely(err)) {
217 ntfs_error(vol->sb, "Inode corrupt: No WIN32 "
218 "namespace counterpart to DOS "
219 "file name. Run chkdsk.");
220 if (err == -ENOENT)
221 err = -EIO;
222 goto err_out;
223 }
224 /* Consistency checks. */
225 a = ctx->attr;
226 if (a->non_resident || a->flags)
227 goto eio_err_out;
228 val_len = le32_to_cpu(a->data.resident.value_length);
229 if (le16_to_cpu(a->data.resident.value_offset) +
230 val_len > le32_to_cpu(a->length))
231 goto eio_err_out;
232 fn = (FILE_NAME_ATTR*)((u8*)ctx->attr + le16_to_cpu(
233 ctx->attr->data.resident.value_offset));
234 if ((u32)(fn->file_name_length * sizeof(ntfschar) +
235 sizeof(FILE_NAME_ATTR)) > val_len)
236 goto eio_err_out;
237 } while (fn->file_name_type != FILE_NAME_WIN32);
238
239 /* Convert the found WIN32 name to current NLS code page. */
240 nls_name.len = (unsigned)ntfs_ucstonls(vol,
241 (ntfschar*)&fn->file_name, fn->file_name_length,
242 (unsigned char**)&nls_name.name, 0);
243
244 ntfs_attr_put_search_ctx(ctx);
245 unmap_mft_record(ni);
246 }
247 m = NULL;
248 ctx = NULL;
249
250 /* Check if a conversion error occurred. */
251 if ((signed)nls_name.len < 0) {
252 err = (signed)nls_name.len;
253 goto err_out;
254 }
255 nls_name.hash = full_name_hash(nls_name.name, nls_name.len);
256
257 /*
258 * Note: No need for dent->d_lock lock as i_sem is held on the
259 * parent inode.
260 */
261
262 /* Does a dentry matching the nls_name exist already? */
263 real_dent = d_lookup(dent->d_parent, &nls_name);
264 /* If not, create it now. */
265 if (!real_dent) {
266 real_dent = d_alloc(dent->d_parent, &nls_name);
267 kfree(nls_name.name);
268 if (!real_dent) {
269 err = -ENOMEM;
270 goto err_out;
271 }
272 new_dent = d_splice_alias(dent_inode, real_dent);
273 if (new_dent)
274 dput(real_dent);
275 else
276 new_dent = real_dent;
277 ntfs_debug("Done. (Created new dentry.)");
278 return new_dent;
279 }
280 kfree(nls_name.name);
281 /* Matching dentry exists, check if it is negative. */
282 if (real_dent->d_inode) {
283 if (unlikely(real_dent->d_inode != dent_inode)) {
284 /* This can happen because bad inodes are unhashed. */
285 BUG_ON(!is_bad_inode(dent_inode));
286 BUG_ON(!is_bad_inode(real_dent->d_inode));
287 }
288 /*
289 * Already have the inode and the dentry attached, decrement
290 * the reference count to balance the ntfs_iget() we did
291 * earlier on. We found the dentry using d_lookup() so it
292 * cannot be disconnected and thus we do not need to worry
293 * about any NFS/disconnectedness issues here.
294 */
295 iput(dent_inode);
296 ntfs_debug("Done. (Already had inode and dentry.)");
297 return real_dent;
298 }
299 /*
300 * Negative dentry: instantiate it unless the inode is a directory and
301 * has a 'disconnected' dentry (i.e. IS_ROOT and DCACHE_DISCONNECTED),
302 * in which case d_move() that in place of the found dentry.
303 */
304 if (!S_ISDIR(dent_inode->i_mode)) {
305 /* Not a directory; everything is easy. */
306 d_instantiate(real_dent, dent_inode);
307 ntfs_debug("Done. (Already had negative file dentry.)");
308 return real_dent;
309 }
310 spin_lock(&dcache_lock);
311 if (list_empty(&dent_inode->i_dentry)) {
312 /*
313 * Directory without a 'disconnected' dentry; we need to do
314 * d_instantiate() by hand because it takes dcache_lock which
315 * we already hold.
316 */
317 list_add(&real_dent->d_alias, &dent_inode->i_dentry);
318 real_dent->d_inode = dent_inode;
319 spin_unlock(&dcache_lock);
320 security_d_instantiate(real_dent, dent_inode);
321 ntfs_debug("Done. (Already had negative directory dentry.)");
322 return real_dent;
323 }
324 /*
325 * Directory with a 'disconnected' dentry; get a reference to the
326 * 'disconnected' dentry.
327 */
328 new_dent = list_entry(dent_inode->i_dentry.next, struct dentry,
329 d_alias);
330 dget_locked(new_dent);
331 spin_unlock(&dcache_lock);
332 /* Do security vodoo. */
333 security_d_instantiate(real_dent, dent_inode);
334 /* Move new_dent in place of real_dent. */
335 d_move(new_dent, real_dent);
336 /* Balance the ntfs_iget() we did above. */
337 iput(dent_inode);
338 /* Throw away real_dent. */
339 dput(real_dent);
340 /* Use new_dent as the actual dentry. */
341 ntfs_debug("Done. (Already had negative, disconnected directory "
342 "dentry.)");
343 return new_dent;
344
345eio_err_out:
346 ntfs_error(vol->sb, "Illegal file name attribute. Run chkdsk.");
347 err = -EIO;
348err_out:
349 if (ctx)
350 ntfs_attr_put_search_ctx(ctx);
351 if (m)
352 unmap_mft_record(ni);
353 iput(dent_inode);
354 ntfs_error(vol->sb, "Failed, returning error code %i.", err);
355 return ERR_PTR(err);
356 }
357}
358
359/**
360 * Inode operations for directories.
361 */
362struct inode_operations ntfs_dir_inode_ops = {
363 .lookup = ntfs_lookup, /* VFS: Lookup directory. */
364};
365
366/**
367 * ntfs_get_parent - find the dentry of the parent of a given directory dentry
368 * @child_dent: dentry of the directory whose parent directory to find
369 *
370 * Find the dentry for the parent directory of the directory specified by the
371 * dentry @child_dent. This function is called from
372 * fs/exportfs/expfs.c::find_exported_dentry() which in turn is called from the
373 * default ->decode_fh() which is export_decode_fh() in the same file.
374 *
375 * The code is based on the ext3 ->get_parent() implementation found in
376 * fs/ext3/namei.c::ext3_get_parent().
377 *
378 * Note: ntfs_get_parent() is called with @child_dent->d_inode->i_sem down.
379 *
380 * Return the dentry of the parent directory on success or the error code on
381 * error (IS_ERR() is true).
382 */
383struct dentry *ntfs_get_parent(struct dentry *child_dent)
384{
385 struct inode *vi = child_dent->d_inode;
386 ntfs_inode *ni = NTFS_I(vi);
387 MFT_RECORD *mrec;
388 ntfs_attr_search_ctx *ctx;
389 ATTR_RECORD *attr;
390 FILE_NAME_ATTR *fn;
391 struct inode *parent_vi;
392 struct dentry *parent_dent;
393 unsigned long parent_ino;
394 int err;
395
396 ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
397 /* Get the mft record of the inode belonging to the child dentry. */
398 mrec = map_mft_record(ni);
399 if (IS_ERR(mrec))
400 return (struct dentry *)mrec;
401 /* Find the first file name attribute in the mft record. */
402 ctx = ntfs_attr_get_search_ctx(ni, mrec);
403 if (unlikely(!ctx)) {
404 unmap_mft_record(ni);
405 return ERR_PTR(-ENOMEM);
406 }
407try_next:
408 err = ntfs_attr_lookup(AT_FILE_NAME, NULL, 0, CASE_SENSITIVE, 0, NULL,
409 0, ctx);
410 if (unlikely(err)) {
411 ntfs_attr_put_search_ctx(ctx);
412 unmap_mft_record(ni);
413 if (err == -ENOENT)
414 ntfs_error(vi->i_sb, "Inode 0x%lx does not have a "
415 "file name attribute. Run chkdsk.",
416 vi->i_ino);
417 return ERR_PTR(err);
418 }
419 attr = ctx->attr;
420 if (unlikely(attr->non_resident))
421 goto try_next;
422 fn = (FILE_NAME_ATTR *)((u8 *)attr +
423 le16_to_cpu(attr->data.resident.value_offset));
424 if (unlikely((u8 *)fn + le32_to_cpu(attr->data.resident.value_length) >
425 (u8*)attr + le32_to_cpu(attr->length)))
426 goto try_next;
427 /* Get the inode number of the parent directory. */
428 parent_ino = MREF_LE(fn->parent_directory);
429 /* Release the search context and the mft record of the child. */
430 ntfs_attr_put_search_ctx(ctx);
431 unmap_mft_record(ni);
432 /* Get the inode of the parent directory. */
433 parent_vi = ntfs_iget(vi->i_sb, parent_ino);
434 if (IS_ERR(parent_vi) || unlikely(is_bad_inode(parent_vi))) {
435 if (!IS_ERR(parent_vi))
436 iput(parent_vi);
437 ntfs_error(vi->i_sb, "Failed to get parent directory inode "
438 "0x%lx of child inode 0x%lx.", parent_ino,
439 vi->i_ino);
440 return ERR_PTR(-EACCES);
441 }
442 /* Finally get a dentry for the parent directory and return it. */
443 parent_dent = d_alloc_anon(parent_vi);
444 if (unlikely(!parent_dent)) {
445 iput(parent_vi);
446 return ERR_PTR(-ENOMEM);
447 }
448 ntfs_debug("Done for inode 0x%lx.", vi->i_ino);
449 return parent_dent;
450}
451
452/**
453 * ntfs_get_dentry - find a dentry for the inode from a file handle sub-fragment
454 * @sb: super block identifying the mounted ntfs volume
455 * @fh: the file handle sub-fragment
456 *
457 * Find a dentry for the inode given a file handle sub-fragment. This function
458 * is called from fs/exportfs/expfs.c::find_exported_dentry() which in turn is
459 * called from the default ->decode_fh() which is export_decode_fh() in the
460 * same file. The code is closely based on the default ->get_dentry() helper
461 * fs/exportfs/expfs.c::get_object().
462 *
463 * The @fh contains two 32-bit unsigned values, the first one is the inode
464 * number and the second one is the inode generation.
465 *
466 * Return the dentry on success or the error code on error (IS_ERR() is true).
467 */
468struct dentry *ntfs_get_dentry(struct super_block *sb, void *fh)
469{
470 struct inode *vi;
471 struct dentry *dent;
472 unsigned long ino = ((u32 *)fh)[0];
473 u32 gen = ((u32 *)fh)[1];
474
475 ntfs_debug("Entering for inode 0x%lx, generation 0x%x.", ino, gen);
476 vi = ntfs_iget(sb, ino);
477 if (IS_ERR(vi)) {
478 ntfs_error(sb, "Failed to get inode 0x%lx.", ino);
479 return (struct dentry *)vi;
480 }
481 if (unlikely(is_bad_inode(vi) || vi->i_generation != gen)) {
482 /* We didn't find the right inode. */
483 ntfs_error(sb, "Inode 0x%lx, bad count: %d %d or version 0x%x "
484 "0x%x.", vi->i_ino, vi->i_nlink,
485 atomic_read(&vi->i_count), vi->i_generation,
486 gen);
487 iput(vi);
488 return ERR_PTR(-ESTALE);
489 }
490 /* Now find a dentry. If possible, get a well-connected one. */
491 dent = d_alloc_anon(vi);
492 if (unlikely(!dent)) {
493 iput(vi);
494 return ERR_PTR(-ENOMEM);
495 }
496 ntfs_debug("Done for inode 0x%lx, generation 0x%x.", ino, gen);
497 return dent;
498}
diff --git a/fs/ntfs/ntfs.h b/fs/ntfs/ntfs.h
new file mode 100644
index 000000000000..720ffb71bab8
--- /dev/null
+++ b/fs/ntfs/ntfs.h
@@ -0,0 +1,129 @@
1/*
2 * ntfs.h - Defines for NTFS Linux kernel driver. Part of the Linux-NTFS
3 * project.
4 *
5 * Copyright (c) 2001-2004 Anton Altaparmakov
6 * Copyright (C) 2002 Richard Russon
7 *
8 * This program/include file is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as published
10 * by the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program/include file is distributed in the hope that it will be
14 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
15 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program (in the main directory of the Linux-NTFS
20 * distribution in the file COPYING); if not, write to the Free Software
21 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
23
24#ifndef _LINUX_NTFS_H
25#define _LINUX_NTFS_H
26
27#include <linux/stddef.h>
28#include <linux/kernel.h>
29#include <linux/module.h>
30#include <linux/compiler.h>
31#include <linux/fs.h>
32#include <linux/nls.h>
33#include <linux/smp.h>
34
35#include "types.h"
36#include "volume.h"
37#include "layout.h"
38
/* Driver wide constants. */
typedef enum {
	/* Block size as a shift count and in bytes (1 << 9 == 512). */
	NTFS_BLOCK_SIZE_BITS	= 9,
	NTFS_BLOCK_SIZE		= 1 << NTFS_BLOCK_SIZE_BITS,
	/* Super block magic, 'NTFS'. */
	NTFS_SB_MAGIC		= 0x5346544e,
	/* Maximum name length. */
	NTFS_MAX_NAME_LEN	= 255,
} NTFS_CONSTANTS;
45
46/* Global variables. */
47
48/* Slab caches (from super.c). */
49extern kmem_cache_t *ntfs_name_cache;
50extern kmem_cache_t *ntfs_inode_cache;
51extern kmem_cache_t *ntfs_big_inode_cache;
52extern kmem_cache_t *ntfs_attr_ctx_cache;
53extern kmem_cache_t *ntfs_index_ctx_cache;
54
55/* The various operations structs defined throughout the driver files. */
56extern struct address_space_operations ntfs_aops;
57extern struct address_space_operations ntfs_mst_aops;
58
59extern struct file_operations ntfs_file_ops;
60extern struct inode_operations ntfs_file_inode_ops;
61
62extern struct file_operations ntfs_dir_ops;
63extern struct inode_operations ntfs_dir_inode_ops;
64
65extern struct file_operations ntfs_empty_file_ops;
66extern struct inode_operations ntfs_empty_inode_ops;
67
68/**
69 * NTFS_SB - return the ntfs volume given a vfs super block
70 * @sb: VFS super block
71 *
72 * NTFS_SB() returns the ntfs volume associated with the VFS super block @sb.
73 */
74static inline ntfs_volume *NTFS_SB(struct super_block *sb)
75{
76 return sb->s_fs_info;
77}
78
79/* Declarations of functions and global variables. */
80
81/* From fs/ntfs/compress.c */
82extern int ntfs_read_compressed_block(struct page *page);
83extern int allocate_compression_buffers(void);
84extern void free_compression_buffers(void);
85
86/* From fs/ntfs/super.c */
87#define default_upcase_len 0x10000
88extern struct semaphore ntfs_lock;
89
90typedef struct {
91 int val;
92 char *str;
93} option_t;
94extern const option_t on_errors_arr[];
95
96/* From fs/ntfs/mst.c */
97extern int post_read_mst_fixup(NTFS_RECORD *b, const u32 size);
98extern int pre_write_mst_fixup(NTFS_RECORD *b, const u32 size);
99extern void post_write_mst_fixup(NTFS_RECORD *b);
100
101/* From fs/ntfs/unistr.c */
102extern BOOL ntfs_are_names_equal(const ntfschar *s1, size_t s1_len,
103 const ntfschar *s2, size_t s2_len,
104 const IGNORE_CASE_BOOL ic,
105 const ntfschar *upcase, const u32 upcase_size);
106extern int ntfs_collate_names(const ntfschar *name1, const u32 name1_len,
107 const ntfschar *name2, const u32 name2_len,
108 const int err_val, const IGNORE_CASE_BOOL ic,
109 const ntfschar *upcase, const u32 upcase_len);
110extern int ntfs_ucsncmp(const ntfschar *s1, const ntfschar *s2, size_t n);
111extern int ntfs_ucsncasecmp(const ntfschar *s1, const ntfschar *s2, size_t n,
112 const ntfschar *upcase, const u32 upcase_size);
113extern void ntfs_upcase_name(ntfschar *name, u32 name_len,
114 const ntfschar *upcase, const u32 upcase_len);
115extern void ntfs_file_upcase_value(FILE_NAME_ATTR *file_name_attr,
116 const ntfschar *upcase, const u32 upcase_len);
117extern int ntfs_file_compare_values(FILE_NAME_ATTR *file_name_attr1,
118 FILE_NAME_ATTR *file_name_attr2,
119 const int err_val, const IGNORE_CASE_BOOL ic,
120 const ntfschar *upcase, const u32 upcase_len);
121extern int ntfs_nlstoucs(const ntfs_volume *vol, const char *ins,
122 const int ins_len, ntfschar **outs);
123extern int ntfs_ucstonls(const ntfs_volume *vol, const ntfschar *ins,
124 const int ins_len, unsigned char **outs, int outs_len);
125
126/* From fs/ntfs/upcase.c */
127extern ntfschar *generate_default_upcase(void);
128
129#endif /* _LINUX_NTFS_H */
diff --git a/fs/ntfs/quota.c b/fs/ntfs/quota.c
new file mode 100644
index 000000000000..833df2a4e9fb
--- /dev/null
+++ b/fs/ntfs/quota.c
@@ -0,0 +1,117 @@
1/*
2 * quota.c - NTFS kernel quota ($Quota) handling. Part of the Linux-NTFS
3 * project.
4 *
5 * Copyright (c) 2004 Anton Altaparmakov
6 *
7 * This program/include file is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as published
9 * by the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program/include file is distributed in the hope that it will be
13 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program (in the main directory of the Linux-NTFS
19 * distribution in the file COPYING); if not, write to the Free Software
20 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 */
22
23#ifdef NTFS_RW
24
25#include "index.h"
26#include "quota.h"
27#include "debug.h"
28#include "ntfs.h"
29
30/**
31 * ntfs_mark_quotas_out_of_date - mark the quotas out of date on an ntfs volume
32 * @vol: ntfs volume on which to mark the quotas out of date
33 *
34 * Mark the quotas out of date on the ntfs volume @vol and return TRUE on
35 * success and FALSE on error.
36 */
37BOOL ntfs_mark_quotas_out_of_date(ntfs_volume *vol)
38{
39 ntfs_index_context *ictx;
40 QUOTA_CONTROL_ENTRY *qce;
41 const le32 qid = QUOTA_DEFAULTS_ID;
42 int err;
43
44 ntfs_debug("Entering.");
45 if (NVolQuotaOutOfDate(vol))
46 goto done;
47 if (!vol->quota_ino || !vol->quota_q_ino) {
48 ntfs_error(vol->sb, "Quota inodes are not open.");
49 return FALSE;
50 }
51 down(&vol->quota_q_ino->i_sem);
52 ictx = ntfs_index_ctx_get(NTFS_I(vol->quota_q_ino));
53 if (!ictx) {
54 ntfs_error(vol->sb, "Failed to get index context.");
55 goto err_out;
56 }
57 err = ntfs_index_lookup(&qid, sizeof(qid), ictx);
58 if (err) {
59 if (err == -ENOENT)
60 ntfs_error(vol->sb, "Quota defaults entry is not "
61 "present.");
62 else
63 ntfs_error(vol->sb, "Lookup of quota defaults entry "
64 "failed.");
65 goto err_out;
66 }
67 if (ictx->data_len < offsetof(QUOTA_CONTROL_ENTRY, sid)) {
68 ntfs_error(vol->sb, "Quota defaults entry size is invalid. "
69 "Run chkdsk.");
70 goto err_out;
71 }
72 qce = (QUOTA_CONTROL_ENTRY*)ictx->data;
73 if (le32_to_cpu(qce->version) != QUOTA_VERSION) {
74 ntfs_error(vol->sb, "Quota defaults entry version 0x%x is not "
75 "supported.", le32_to_cpu(qce->version));
76 goto err_out;
77 }
78 ntfs_debug("Quota defaults flags = 0x%x.", le32_to_cpu(qce->flags));
79 /* If quotas are already marked out of date, no need to do anything. */
80 if (qce->flags & QUOTA_FLAG_OUT_OF_DATE)
81 goto set_done;
82 /*
83 * If quota tracking is neither requested, nor enabled and there are no
84 * pending deletes, no need to mark the quotas out of date.
85 */
86 if (!(qce->flags & (QUOTA_FLAG_TRACKING_ENABLED |
87 QUOTA_FLAG_TRACKING_REQUESTED |
88 QUOTA_FLAG_PENDING_DELETES)))
89 goto set_done;
90 /*
91 * Set the QUOTA_FLAG_OUT_OF_DATE bit thus marking quotas out of date.
92 * This is verified on WinXP to be sufficient to cause windows to
93 * rescan the volume on boot and update all quota entries.
94 */
95 qce->flags |= QUOTA_FLAG_OUT_OF_DATE;
96 /* Ensure the modified flags are written to disk. */
97 ntfs_index_entry_flush_dcache_page(ictx);
98 ntfs_index_entry_mark_dirty(ictx);
99set_done:
100 ntfs_index_ctx_put(ictx);
101 up(&vol->quota_q_ino->i_sem);
102 /*
103 * We set the flag so we do not try to mark the quotas out of date
104 * again on remount.
105 */
106 NVolSetQuotaOutOfDate(vol);
107done:
108 ntfs_debug("Done.");
109 return TRUE;
110err_out:
111 if (ictx)
112 ntfs_index_ctx_put(ictx);
113 up(&vol->quota_q_ino->i_sem);
114 return FALSE;
115}
116
117#endif /* NTFS_RW */
diff --git a/fs/ntfs/quota.h b/fs/ntfs/quota.h
new file mode 100644
index 000000000000..40e4763aa222
--- /dev/null
+++ b/fs/ntfs/quota.h
@@ -0,0 +1,35 @@
1/*
2 * quota.h - Defines for NTFS kernel quota ($Quota) handling. Part of the
3 * Linux-NTFS project.
4 *
5 * Copyright (c) 2004 Anton Altaparmakov
6 *
7 * This program/include file is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as published
9 * by the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program/include file is distributed in the hope that it will be
13 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program (in the main directory of the Linux-NTFS
19 * distribution in the file COPYING); if not, write to the Free Software
20 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 */
22
23#ifndef _LINUX_NTFS_QUOTA_H
24#define _LINUX_NTFS_QUOTA_H
25
26#ifdef NTFS_RW
27
28#include "types.h"
29#include "volume.h"
30
31extern BOOL ntfs_mark_quotas_out_of_date(ntfs_volume *vol);
32
33#endif /* NTFS_RW */
34
35#endif /* _LINUX_NTFS_QUOTA_H */
diff --git a/fs/ntfs/runlist.c b/fs/ntfs/runlist.c
new file mode 100644
index 000000000000..8438fb1da219
--- /dev/null
+++ b/fs/ntfs/runlist.c
@@ -0,0 +1,1438 @@
1/**
2 * runlist.c - NTFS runlist handling code. Part of the Linux-NTFS project.
3 *
4 * Copyright (c) 2001-2004 Anton Altaparmakov
5 * Copyright (c) 2002 Richard Russon
6 *
7 * This program/include file is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as published
9 * by the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program/include file is distributed in the hope that it will be
13 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program (in the main directory of the Linux-NTFS
19 * distribution in the file COPYING); if not, write to the Free Software
20 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 */
22
23#include "debug.h"
24#include "dir.h"
25#include "endian.h"
26#include "malloc.h"
27#include "ntfs.h"
28
29/**
30 * ntfs_rl_mm - runlist memmove
31 *
32 * It is up to the caller to serialize access to the runlist @base.
33 */
34static inline void ntfs_rl_mm(runlist_element *base, int dst, int src,
35 int size)
36{
37 if (likely((dst != src) && (size > 0)))
38 memmove(base + dst, base + src, size * sizeof (*base));
39}
40
41/**
42 * ntfs_rl_mc - runlist memory copy
43 *
44 * It is up to the caller to serialize access to the runlists @dstbase and
45 * @srcbase.
46 */
47static inline void ntfs_rl_mc(runlist_element *dstbase, int dst,
48 runlist_element *srcbase, int src, int size)
49{
50 if (likely(size > 0))
51 memcpy(dstbase + dst, srcbase + src, size * sizeof(*dstbase));
52}
53
54/**
55 * ntfs_rl_realloc - Reallocate memory for runlists
56 * @rl: original runlist
57 * @old_size: number of runlist elements in the original runlist @rl
58 * @new_size: number of runlist elements we need space for
59 *
60 * As the runlists grow, more memory will be required. To prevent the
61 * kernel having to allocate and reallocate large numbers of small bits of
62 * memory, this function returns and entire page of memory.
63 *
64 * It is up to the caller to serialize access to the runlist @rl.
65 *
66 * N.B. If the new allocation doesn't require a different number of pages in
67 * memory, the function will return the original pointer.
68 *
69 * On success, return a pointer to the newly allocated, or recycled, memory.
70 * On error, return -errno. The following error codes are defined:
71 * -ENOMEM - Not enough memory to allocate runlist array.
72 * -EINVAL - Invalid parameters were passed in.
73 */
74static inline runlist_element *ntfs_rl_realloc(runlist_element *rl,
75 int old_size, int new_size)
76{
77 runlist_element *new_rl;
78
79 old_size = PAGE_ALIGN(old_size * sizeof(*rl));
80 new_size = PAGE_ALIGN(new_size * sizeof(*rl));
81 if (old_size == new_size)
82 return rl;
83
84 new_rl = ntfs_malloc_nofs(new_size);
85 if (unlikely(!new_rl))
86 return ERR_PTR(-ENOMEM);
87
88 if (likely(rl != NULL)) {
89 if (unlikely(old_size > new_size))
90 old_size = new_size;
91 memcpy(new_rl, rl, old_size);
92 ntfs_free(rl);
93 }
94 return new_rl;
95}
96
97/**
98 * ntfs_are_rl_mergeable - test if two runlists can be joined together
99 * @dst: original runlist
100 * @src: new runlist to test for mergeability with @dst
101 *
102 * Test if two runlists can be joined together. For this, their VCNs and LCNs
103 * must be adjacent.
104 *
105 * It is up to the caller to serialize access to the runlists @dst and @src.
106 *
107 * Return: TRUE Success, the runlists can be merged.
108 * FALSE Failure, the runlists cannot be merged.
109 */
110static inline BOOL ntfs_are_rl_mergeable(runlist_element *dst,
111 runlist_element *src)
112{
113 BUG_ON(!dst);
114 BUG_ON(!src);
115
116 if ((dst->lcn < 0) || (src->lcn < 0)) /* Are we merging holes? */
117 return FALSE;
118 if ((dst->lcn + dst->length) != src->lcn) /* Are the runs contiguous? */
119 return FALSE;
120 if ((dst->vcn + dst->length) != src->vcn) /* Are the runs misaligned? */
121 return FALSE;
122
123 return TRUE;
124}
125
126/**
127 * __ntfs_rl_merge - merge two runlists without testing if they can be merged
128 * @dst: original, destination runlist
129 * @src: new runlist to merge with @dst
130 *
131 * Merge the two runlists, writing into the destination runlist @dst. The
132 * caller must make sure the runlists can be merged or this will corrupt the
133 * destination runlist.
134 *
135 * It is up to the caller to serialize access to the runlists @dst and @src.
136 */
137static inline void __ntfs_rl_merge(runlist_element *dst, runlist_element *src)
138{
139 dst->length += src->length;
140}
141
142/**
143 * ntfs_rl_append - append a runlist after a given element
144 * @dst: original runlist to be worked on
145 * @dsize: number of elements in @dst (including end marker)
146 * @src: runlist to be inserted into @dst
147 * @ssize: number of elements in @src (excluding end marker)
148 * @loc: append the new runlist @src after this element in @dst
149 *
150 * Append the runlist @src after element @loc in @dst. Merge the right end of
151 * the new runlist, if necessary. Adjust the size of the hole before the
152 * appended runlist.
153 *
154 * It is up to the caller to serialize access to the runlists @dst and @src.
155 *
156 * On success, return a pointer to the new, combined, runlist. Note, both
157 * runlists @dst and @src are deallocated before returning so you cannot use
158 * the pointers for anything any more. (Strictly speaking the returned runlist
159 * may be the same as @dst but this is irrelevant.)
160 *
161 * On error, return -errno. Both runlists are left unmodified. The following
162 * error codes are defined:
163 * -ENOMEM - Not enough memory to allocate runlist array.
164 * -EINVAL - Invalid parameters were passed in.
165 */
166static inline runlist_element *ntfs_rl_append(runlist_element *dst,
167 int dsize, runlist_element *src, int ssize, int loc)
168{
169 BOOL right;
170 int magic;
171
172 BUG_ON(!dst);
173 BUG_ON(!src);
174
175 /* First, check if the right hand end needs merging. */
176 right = ntfs_are_rl_mergeable(src + ssize - 1, dst + loc + 1);
177
178 /* Space required: @dst size + @src size, less one if we merged. */
179 dst = ntfs_rl_realloc(dst, dsize, dsize + ssize - right);
180 if (IS_ERR(dst))
181 return dst;
182 /*
183 * We are guaranteed to succeed from here so can start modifying the
184 * original runlists.
185 */
186
187 /* First, merge the right hand end, if necessary. */
188 if (right)
189 __ntfs_rl_merge(src + ssize - 1, dst + loc + 1);
190
191 magic = loc + ssize;
192
193 /* Move the tail of @dst out of the way, then copy in @src. */
194 ntfs_rl_mm(dst, magic + 1, loc + 1 + right, dsize - loc - 1 - right);
195 ntfs_rl_mc(dst, loc + 1, src, 0, ssize);
196
197 /* Adjust the size of the preceding hole. */
198 dst[loc].length = dst[loc + 1].vcn - dst[loc].vcn;
199
200 /* We may have changed the length of the file, so fix the end marker */
201 if (dst[magic + 1].lcn == LCN_ENOENT)
202 dst[magic + 1].vcn = dst[magic].vcn + dst[magic].length;
203
204 return dst;
205}
206
207/**
208 * ntfs_rl_insert - insert a runlist into another
209 * @dst: original runlist to be worked on
210 * @dsize: number of elements in @dst (including end marker)
211 * @src: new runlist to be inserted
212 * @ssize: number of elements in @src (excluding end marker)
213 * @loc: insert the new runlist @src before this element in @dst
214 *
215 * Insert the runlist @src before element @loc in the runlist @dst. Merge the
216 * left end of the new runlist, if necessary. Adjust the size of the hole
217 * after the inserted runlist.
218 *
219 * It is up to the caller to serialize access to the runlists @dst and @src.
220 *
221 * On success, return a pointer to the new, combined, runlist. Note, both
222 * runlists @dst and @src are deallocated before returning so you cannot use
223 * the pointers for anything any more. (Strictly speaking the returned runlist
224 * may be the same as @dst but this is irrelevant.)
225 *
226 * On error, return -errno. Both runlists are left unmodified. The following
227 * error codes are defined:
228 * -ENOMEM - Not enough memory to allocate runlist array.
229 * -EINVAL - Invalid parameters were passed in.
230 */
231static inline runlist_element *ntfs_rl_insert(runlist_element *dst,
232 int dsize, runlist_element *src, int ssize, int loc)
233{
234 BOOL left = FALSE;
235 BOOL disc = FALSE; /* Discontinuity */
236 BOOL hole = FALSE; /* Following a hole */
237 int magic;
238
239 BUG_ON(!dst);
240 BUG_ON(!src);
241
242 /* disc => Discontinuity between the end of @dst and the start of @src.
243 * This means we might need to insert a hole.
244 * hole => @dst ends with a hole or an unmapped region which we can
245 * extend to match the discontinuity. */
246 if (loc == 0)
247 disc = (src[0].vcn > 0);
248 else {
249 s64 merged_length;
250
251 left = ntfs_are_rl_mergeable(dst + loc - 1, src);
252
253 merged_length = dst[loc - 1].length;
254 if (left)
255 merged_length += src->length;
256
257 disc = (src[0].vcn > dst[loc - 1].vcn + merged_length);
258 if (disc)
259 hole = (dst[loc - 1].lcn == LCN_HOLE);
260 }
261
262 /* Space required: @dst size + @src size, less one if we merged, plus
263 * one if there was a discontinuity, less one for a trailing hole. */
264 dst = ntfs_rl_realloc(dst, dsize, dsize + ssize - left + disc - hole);
265 if (IS_ERR(dst))
266 return dst;
267 /*
268 * We are guaranteed to succeed from here so can start modifying the
269 * original runlist.
270 */
271
272 if (left)
273 __ntfs_rl_merge(dst + loc - 1, src);
274
275 magic = loc + ssize - left + disc - hole;
276
277 /* Move the tail of @dst out of the way, then copy in @src. */
278 ntfs_rl_mm(dst, magic, loc, dsize - loc);
279 ntfs_rl_mc(dst, loc + disc - hole, src, left, ssize - left);
280
281 /* Adjust the VCN of the last run ... */
282 if (dst[magic].lcn <= LCN_HOLE)
283 dst[magic].vcn = dst[magic - 1].vcn + dst[magic - 1].length;
284 /* ... and the length. */
285 if (dst[magic].lcn == LCN_HOLE || dst[magic].lcn == LCN_RL_NOT_MAPPED)
286 dst[magic].length = dst[magic + 1].vcn - dst[magic].vcn;
287
288 /* Writing beyond the end of the file and there's a discontinuity. */
289 if (disc) {
290 if (hole)
291 dst[loc - 1].length = dst[loc].vcn - dst[loc - 1].vcn;
292 else {
293 if (loc > 0) {
294 dst[loc].vcn = dst[loc - 1].vcn +
295 dst[loc - 1].length;
296 dst[loc].length = dst[loc + 1].vcn -
297 dst[loc].vcn;
298 } else {
299 dst[loc].vcn = 0;
300 dst[loc].length = dst[loc + 1].vcn;
301 }
302 dst[loc].lcn = LCN_RL_NOT_MAPPED;
303 }
304
305 magic += hole;
306
307 if (dst[magic].lcn == LCN_ENOENT)
308 dst[magic].vcn = dst[magic - 1].vcn +
309 dst[magic - 1].length;
310 }
311 return dst;
312}
313
314/**
315 * ntfs_rl_replace - overwrite a runlist element with another runlist
316 * @dst: original runlist to be worked on
317 * @dsize: number of elements in @dst (including end marker)
318 * @src: new runlist to be inserted
319 * @ssize: number of elements in @src (excluding end marker)
320 * @loc: index in runlist @dst to overwrite with @src
321 *
322 * Replace the runlist element @dst at @loc with @src. Merge the left and
323 * right ends of the inserted runlist, if necessary.
324 *
325 * It is up to the caller to serialize access to the runlists @dst and @src.
326 *
327 * On success, return a pointer to the new, combined, runlist. Note, both
328 * runlists @dst and @src are deallocated before returning so you cannot use
329 * the pointers for anything any more. (Strictly speaking the returned runlist
330 * may be the same as @dst but this is irrelevant.)
331 *
332 * On error, return -errno. Both runlists are left unmodified. The following
333 * error codes are defined:
334 * -ENOMEM - Not enough memory to allocate runlist array.
335 * -EINVAL - Invalid parameters were passed in.
336 */
337static inline runlist_element *ntfs_rl_replace(runlist_element *dst,
338 int dsize, runlist_element *src, int ssize, int loc)
339{
340 BOOL left = FALSE;
341 BOOL right;
342 int magic;
343
344 BUG_ON(!dst);
345 BUG_ON(!src);
346
347 /* First, merge the left and right ends, if necessary. */
348 right = ntfs_are_rl_mergeable(src + ssize - 1, dst + loc + 1);
349 if (loc > 0)
350 left = ntfs_are_rl_mergeable(dst + loc - 1, src);
351
352 /* Allocate some space. We'll need less if the left, right, or both
353 * ends were merged. */
354 dst = ntfs_rl_realloc(dst, dsize, dsize + ssize - left - right);
355 if (IS_ERR(dst))
356 return dst;
357 /*
358 * We are guaranteed to succeed from here so can start modifying the
359 * original runlists.
360 */
361 if (right)
362 __ntfs_rl_merge(src + ssize - 1, dst + loc + 1);
363 if (left)
364 __ntfs_rl_merge(dst + loc - 1, src);
365
366 /* FIXME: What does this mean? (AIA) */
367 magic = loc + ssize - left;
368
369 /* Move the tail of @dst out of the way, then copy in @src. */
370 ntfs_rl_mm(dst, magic, loc + right + 1, dsize - loc - right - 1);
371 ntfs_rl_mc(dst, loc, src, left, ssize - left);
372
373 /* We may have changed the length of the file, so fix the end marker */
374 if (dst[magic].lcn == LCN_ENOENT)
375 dst[magic].vcn = dst[magic - 1].vcn + dst[magic - 1].length;
376 return dst;
377}
378
379/**
380 * ntfs_rl_split - insert a runlist into the centre of a hole
381 * @dst: original runlist to be worked on
382 * @dsize: number of elements in @dst (including end marker)
383 * @src: new runlist to be inserted
384 * @ssize: number of elements in @src (excluding end marker)
385 * @loc: index in runlist @dst at which to split and insert @src
386 *
387 * Split the runlist @dst at @loc into two and insert @new in between the two
388 * fragments. No merging of runlists is necessary. Adjust the size of the
389 * holes either side.
390 *
391 * It is up to the caller to serialize access to the runlists @dst and @src.
392 *
393 * On success, return a pointer to the new, combined, runlist. Note, both
394 * runlists @dst and @src are deallocated before returning so you cannot use
395 * the pointers for anything any more. (Strictly speaking the returned runlist
396 * may be the same as @dst but this is irrelevant.)
397 *
398 * On error, return -errno. Both runlists are left unmodified. The following
399 * error codes are defined:
400 * -ENOMEM - Not enough memory to allocate runlist array.
401 * -EINVAL - Invalid parameters were passed in.
402 */
403static inline runlist_element *ntfs_rl_split(runlist_element *dst, int dsize,
404 runlist_element *src, int ssize, int loc)
405{
406 BUG_ON(!dst);
407 BUG_ON(!src);
408
409 /* Space required: @dst size + @src size + one new hole. */
410 dst = ntfs_rl_realloc(dst, dsize, dsize + ssize + 1);
411 if (IS_ERR(dst))
412 return dst;
413 /*
414 * We are guaranteed to succeed from here so can start modifying the
415 * original runlists.
416 */
417
418 /* Move the tail of @dst out of the way, then copy in @src. */
419 ntfs_rl_mm(dst, loc + 1 + ssize, loc, dsize - loc);
420 ntfs_rl_mc(dst, loc + 1, src, 0, ssize);
421
422 /* Adjust the size of the holes either size of @src. */
423 dst[loc].length = dst[loc+1].vcn - dst[loc].vcn;
424 dst[loc+ssize+1].vcn = dst[loc+ssize].vcn + dst[loc+ssize].length;
425 dst[loc+ssize+1].length = dst[loc+ssize+2].vcn - dst[loc+ssize+1].vcn;
426
427 return dst;
428}
429
430/**
431 * ntfs_runlists_merge - merge two runlists into one
432 * @drl: original runlist to be worked on
433 * @srl: new runlist to be merged into @drl
434 *
435 * First we sanity check the two runlists @srl and @drl to make sure that they
436 * are sensible and can be merged. The runlist @srl must be either after the
437 * runlist @drl or completely within a hole (or unmapped region) in @drl.
438 *
439 * It is up to the caller to serialize access to the runlists @drl and @srl.
440 *
441 * Merging of runlists is necessary in two cases:
442 * 1. When attribute lists are used and a further extent is being mapped.
443 * 2. When new clusters are allocated to fill a hole or extend a file.
444 *
445 * There are four possible ways @srl can be merged. It can:
446 * - be inserted at the beginning of a hole,
447 * - split the hole in two and be inserted between the two fragments,
448 * - be appended at the end of a hole, or it can
449 * - replace the whole hole.
450 * It can also be appended to the end of the runlist, which is just a variant
451 * of the insert case.
452 *
453 * On success, return a pointer to the new, combined, runlist. Note, both
454 * runlists @drl and @srl are deallocated before returning so you cannot use
455 * the pointers for anything any more. (Strictly speaking the returned runlist
456 * may be the same as @dst but this is irrelevant.)
457 *
458 * On error, return -errno. Both runlists are left unmodified. The following
459 * error codes are defined:
460 * -ENOMEM - Not enough memory to allocate runlist array.
461 * -EINVAL - Invalid parameters were passed in.
462 * -ERANGE - The runlists overlap and cannot be merged.
463 */
464runlist_element *ntfs_runlists_merge(runlist_element *drl,
465 runlist_element *srl)
466{
467 int di, si; /* Current index into @[ds]rl. */
468 int sstart; /* First index with lcn > LCN_RL_NOT_MAPPED. */
469 int dins; /* Index into @drl at which to insert @srl. */
470 int dend, send; /* Last index into @[ds]rl. */
471 int dfinal, sfinal; /* The last index into @[ds]rl with
472 lcn >= LCN_HOLE. */
473 int marker = 0;
474 VCN marker_vcn = 0;
475
476#ifdef DEBUG
477 ntfs_debug("dst:");
478 ntfs_debug_dump_runlist(drl);
479 ntfs_debug("src:");
480 ntfs_debug_dump_runlist(srl);
481#endif
482
483 /* Check for silly calling... */
484 if (unlikely(!srl))
485 return drl;
486 if (IS_ERR(srl) || IS_ERR(drl))
487 return ERR_PTR(-EINVAL);
488
489 /* Check for the case where the first mapping is being done now. */
490 if (unlikely(!drl)) {
491 drl = srl;
492 /* Complete the source runlist if necessary. */
493 if (unlikely(drl[0].vcn)) {
494 /* Scan to the end of the source runlist. */
495 for (dend = 0; likely(drl[dend].length); dend++)
496 ;
497 drl = ntfs_rl_realloc(drl, dend, dend + 1);
498 if (IS_ERR(drl))
499 return drl;
500 /* Insert start element at the front of the runlist. */
501 ntfs_rl_mm(drl, 1, 0, dend);
502 drl[0].vcn = 0;
503 drl[0].lcn = LCN_RL_NOT_MAPPED;
504 drl[0].length = drl[1].vcn;
505 }
506 goto finished;
507 }
508
509 si = di = 0;
510
511 /* Skip any unmapped start element(s) in the source runlist. */
512 while (srl[si].length && srl[si].lcn < LCN_HOLE)
513 si++;
514
515 /* Can't have an entirely unmapped source runlist. */
516 BUG_ON(!srl[si].length);
517
518 /* Record the starting points. */
519 sstart = si;
520
521 /*
522 * Skip forward in @drl until we reach the position where @srl needs to
523 * be inserted. If we reach the end of @drl, @srl just needs to be
524 * appended to @drl.
525 */
526 for (; drl[di].length; di++) {
527 if (drl[di].vcn + drl[di].length > srl[sstart].vcn)
528 break;
529 }
530 dins = di;
531
532 /* Sanity check for illegal overlaps. */
533 if ((drl[di].vcn == srl[si].vcn) && (drl[di].lcn >= 0) &&
534 (srl[si].lcn >= 0)) {
535 ntfs_error(NULL, "Run lists overlap. Cannot merge!");
536 return ERR_PTR(-ERANGE);
537 }
538
539 /* Scan to the end of both runlists in order to know their sizes. */
540 for (send = si; srl[send].length; send++)
541 ;
542 for (dend = di; drl[dend].length; dend++)
543 ;
544
545 if (srl[send].lcn == LCN_ENOENT)
546 marker_vcn = srl[marker = send].vcn;
547
548 /* Scan to the last element with lcn >= LCN_HOLE. */
549 for (sfinal = send; sfinal >= 0 && srl[sfinal].lcn < LCN_HOLE; sfinal--)
550 ;
551 for (dfinal = dend; dfinal >= 0 && drl[dfinal].lcn < LCN_HOLE; dfinal--)
552 ;
553
554 {
555 BOOL start;
556 BOOL finish;
557 int ds = dend + 1; /* Number of elements in drl & srl */
558 int ss = sfinal - sstart + 1;
559
560 start = ((drl[dins].lcn < LCN_RL_NOT_MAPPED) || /* End of file */
561 (drl[dins].vcn == srl[sstart].vcn)); /* Start of hole */
562 finish = ((drl[dins].lcn >= LCN_RL_NOT_MAPPED) && /* End of file */
563 ((drl[dins].vcn + drl[dins].length) <= /* End of hole */
564 (srl[send - 1].vcn + srl[send - 1].length)));
565
566 /* Or we'll lose an end marker */
567 if (start && finish && (drl[dins].length == 0))
568 ss++;
569 if (marker && (drl[dins].vcn + drl[dins].length > srl[send - 1].vcn))
570 finish = FALSE;
571#if 0
572 ntfs_debug("dfinal = %i, dend = %i", dfinal, dend);
573 ntfs_debug("sstart = %i, sfinal = %i, send = %i", sstart, sfinal, send);
574 ntfs_debug("start = %i, finish = %i", start, finish);
575 ntfs_debug("ds = %i, ss = %i, dins = %i", ds, ss, dins);
576#endif
577 if (start) {
578 if (finish)
579 drl = ntfs_rl_replace(drl, ds, srl + sstart, ss, dins);
580 else
581 drl = ntfs_rl_insert(drl, ds, srl + sstart, ss, dins);
582 } else {
583 if (finish)
584 drl = ntfs_rl_append(drl, ds, srl + sstart, ss, dins);
585 else
586 drl = ntfs_rl_split(drl, ds, srl + sstart, ss, dins);
587 }
588 if (IS_ERR(drl)) {
589 ntfs_error(NULL, "Merge failed.");
590 return drl;
591 }
592 ntfs_free(srl);
593 if (marker) {
594 ntfs_debug("Triggering marker code.");
595 for (ds = dend; drl[ds].length; ds++)
596 ;
597 /* We only need to care if @srl ended after @drl. */
598 if (drl[ds].vcn <= marker_vcn) {
599 int slots = 0;
600
601 if (drl[ds].vcn == marker_vcn) {
602 ntfs_debug("Old marker = 0x%llx, replacing "
603 "with LCN_ENOENT.",
604 (unsigned long long)
605 drl[ds].lcn);
606 drl[ds].lcn = LCN_ENOENT;
607 goto finished;
608 }
609 /*
610 * We need to create an unmapped runlist element in
611 * @drl or extend an existing one before adding the
612 * ENOENT terminator.
613 */
614 if (drl[ds].lcn == LCN_ENOENT) {
615 ds--;
616 slots = 1;
617 }
618 if (drl[ds].lcn != LCN_RL_NOT_MAPPED) {
619 /* Add an unmapped runlist element. */
620 if (!slots) {
621 /* FIXME/TODO: We need to have the
622 * extra memory already! (AIA) */
623 drl = ntfs_rl_realloc(drl, ds, ds + 2);
624 if (!drl)
625 goto critical_error;
626 slots = 2;
627 }
628 ds++;
629 /* Need to set vcn if it isn't set already. */
630 if (slots != 1)
631 drl[ds].vcn = drl[ds - 1].vcn +
632 drl[ds - 1].length;
633 drl[ds].lcn = LCN_RL_NOT_MAPPED;
634 /* We now used up a slot. */
635 slots--;
636 }
637 drl[ds].length = marker_vcn - drl[ds].vcn;
638 /* Finally add the ENOENT terminator. */
639 ds++;
640 if (!slots) {
641 /* FIXME/TODO: We need to have the extra
642 * memory already! (AIA) */
643 drl = ntfs_rl_realloc(drl, ds, ds + 1);
644 if (!drl)
645 goto critical_error;
646 }
647 drl[ds].vcn = marker_vcn;
648 drl[ds].lcn = LCN_ENOENT;
649 drl[ds].length = (s64)0;
650 }
651 }
652 }
653
654finished:
655 /* The merge was completed successfully. */
656 ntfs_debug("Merged runlist:");
657 ntfs_debug_dump_runlist(drl);
658 return drl;
659
660critical_error:
661 /* Critical error! We cannot afford to fail here. */
662 ntfs_error(NULL, "Critical error! Not enough memory.");
663 panic("NTFS: Cannot continue.");
664}
665
666/**
667 * ntfs_mapping_pairs_decompress - convert mapping pairs array to runlist
668 * @vol: ntfs volume on which the attribute resides
669 * @attr: attribute record whose mapping pairs array to decompress
670 * @old_rl: optional runlist in which to insert @attr's runlist
671 *
672 * It is up to the caller to serialize access to the runlist @old_rl.
673 *
674 * Decompress the attribute @attr's mapping pairs array into a runlist. On
675 * success, return the decompressed runlist.
676 *
677 * If @old_rl is not NULL, decompressed runlist is inserted into the
678 * appropriate place in @old_rl and the resultant, combined runlist is
679 * returned. The original @old_rl is deallocated.
680 *
681 * On error, return -errno. @old_rl is left unmodified in that case.
682 *
683 * The following error codes are defined:
684 * -ENOMEM - Not enough memory to allocate runlist array.
685 * -EIO - Corrupt runlist.
686 * -EINVAL - Invalid parameters were passed in.
687 * -ERANGE - The two runlists overlap.
688 *
689 * FIXME: For now we take the conceptionally simplest approach of creating the
690 * new runlist disregarding the already existing one and then splicing the
691 * two into one, if that is possible (we check for overlap and discard the new
692 * runlist if overlap present before returning ERR_PTR(-ERANGE)).
693 */
694runlist_element *ntfs_mapping_pairs_decompress(const ntfs_volume *vol,
695 const ATTR_RECORD *attr, runlist_element *old_rl)
696{
697 VCN vcn; /* Current vcn. */
698 LCN lcn; /* Current lcn. */
699 s64 deltaxcn; /* Change in [vl]cn. */
700 runlist_element *rl; /* The output runlist. */
701 u8 *buf; /* Current position in mapping pairs array. */
702 u8 *attr_end; /* End of attribute. */
703 int rlsize; /* Size of runlist buffer. */
704 u16 rlpos; /* Current runlist position in units of
705 runlist_elements. */
706 u8 b; /* Current byte offset in buf. */
707
708#ifdef DEBUG
709 /* Make sure attr exists and is non-resident. */
710 if (!attr || !attr->non_resident || sle64_to_cpu(
711 attr->data.non_resident.lowest_vcn) < (VCN)0) {
712 ntfs_error(vol->sb, "Invalid arguments.");
713 return ERR_PTR(-EINVAL);
714 }
715#endif
716 /* Start at vcn = lowest_vcn and lcn 0. */
717 vcn = sle64_to_cpu(attr->data.non_resident.lowest_vcn);
718 lcn = 0;
719 /* Get start of the mapping pairs array. */
720 buf = (u8*)attr + le16_to_cpu(
721 attr->data.non_resident.mapping_pairs_offset);
722 attr_end = (u8*)attr + le32_to_cpu(attr->length);
723 if (unlikely(buf < (u8*)attr || buf > attr_end)) {
724 ntfs_error(vol->sb, "Corrupt attribute.");
725 return ERR_PTR(-EIO);
726 }
727 /* Current position in runlist array. */
728 rlpos = 0;
729 /* Allocate first page and set current runlist size to one page. */
730 rl = ntfs_malloc_nofs(rlsize = PAGE_SIZE);
731 if (unlikely(!rl))
732 return ERR_PTR(-ENOMEM);
733 /* Insert unmapped starting element if necessary. */
734 if (vcn) {
735 rl->vcn = 0;
736 rl->lcn = LCN_RL_NOT_MAPPED;
737 rl->length = vcn;
738 rlpos++;
739 }
740 while (buf < attr_end && *buf) {
741 /*
742 * Allocate more memory if needed, including space for the
743 * not-mapped and terminator elements. ntfs_malloc_nofs()
744 * operates on whole pages only.
745 */
746 if (((rlpos + 3) * sizeof(*old_rl)) > rlsize) {
747 runlist_element *rl2;
748
749 rl2 = ntfs_malloc_nofs(rlsize + (int)PAGE_SIZE);
750 if (unlikely(!rl2)) {
751 ntfs_free(rl);
752 return ERR_PTR(-ENOMEM);
753 }
754 memcpy(rl2, rl, rlsize);
755 ntfs_free(rl);
756 rl = rl2;
757 rlsize += PAGE_SIZE;
758 }
759 /* Enter the current vcn into the current runlist element. */
760 rl[rlpos].vcn = vcn;
761 /*
762 * Get the change in vcn, i.e. the run length in clusters.
763 * Doing it this way ensures that we signextend negative values.
764 * A negative run length doesn't make any sense, but hey, I
765 * didn't make up the NTFS specs and Windows NT4 treats the run
766 * length as a signed value so that's how it is...
767 */
768 b = *buf & 0xf;
769 if (b) {
770 if (unlikely(buf + b > attr_end))
771 goto io_error;
772 for (deltaxcn = (s8)buf[b--]; b; b--)
773 deltaxcn = (deltaxcn << 8) + buf[b];
774 } else { /* The length entry is compulsory. */
775 ntfs_error(vol->sb, "Missing length entry in mapping "
776 "pairs array.");
777 deltaxcn = (s64)-1;
778 }
779 /*
780 * Assume a negative length to indicate data corruption and
781 * hence clean-up and return NULL.
782 */
783 if (unlikely(deltaxcn < 0)) {
784 ntfs_error(vol->sb, "Invalid length in mapping pairs "
785 "array.");
786 goto err_out;
787 }
788 /*
789 * Enter the current run length into the current runlist
790 * element.
791 */
792 rl[rlpos].length = deltaxcn;
793 /* Increment the current vcn by the current run length. */
794 vcn += deltaxcn;
795 /*
796 * There might be no lcn change at all, as is the case for
797 * sparse clusters on NTFS 3.0+, in which case we set the lcn
798 * to LCN_HOLE.
799 */
800 if (!(*buf & 0xf0))
801 rl[rlpos].lcn = LCN_HOLE;
802 else {
803 /* Get the lcn change which really can be negative. */
804 u8 b2 = *buf & 0xf;
805 b = b2 + ((*buf >> 4) & 0xf);
806 if (buf + b > attr_end)
807 goto io_error;
808 for (deltaxcn = (s8)buf[b--]; b > b2; b--)
809 deltaxcn = (deltaxcn << 8) + buf[b];
810 /* Change the current lcn to its new value. */
811 lcn += deltaxcn;
812#ifdef DEBUG
813 /*
814 * On NTFS 1.2-, apparently can have lcn == -1 to
815 * indicate a hole. But we haven't verified ourselves
816 * whether it is really the lcn or the deltaxcn that is
817 * -1. So if either is found give us a message so we
818 * can investigate it further!
819 */
820 if (vol->major_ver < 3) {
821 if (unlikely(deltaxcn == (LCN)-1))
822 ntfs_error(vol->sb, "lcn delta == -1");
823 if (unlikely(lcn == (LCN)-1))
824 ntfs_error(vol->sb, "lcn == -1");
825 }
826#endif
827 /* Check lcn is not below -1. */
828 if (unlikely(lcn < (LCN)-1)) {
829 ntfs_error(vol->sb, "Invalid LCN < -1 in "
830 "mapping pairs array.");
831 goto err_out;
832 }
833 /* Enter the current lcn into the runlist element. */
834 rl[rlpos].lcn = lcn;
835 }
836 /* Get to the next runlist element. */
837 rlpos++;
838 /* Increment the buffer position to the next mapping pair. */
839 buf += (*buf & 0xf) + ((*buf >> 4) & 0xf) + 1;
840 }
841 if (unlikely(buf >= attr_end))
842 goto io_error;
843 /*
844 * If there is a highest_vcn specified, it must be equal to the final
845 * vcn in the runlist - 1, or something has gone badly wrong.
846 */
847 deltaxcn = sle64_to_cpu(attr->data.non_resident.highest_vcn);
848 if (unlikely(deltaxcn && vcn - 1 != deltaxcn)) {
849mpa_err:
850 ntfs_error(vol->sb, "Corrupt mapping pairs array in "
851 "non-resident attribute.");
852 goto err_out;
853 }
854 /* Setup not mapped runlist element if this is the base extent. */
855 if (!attr->data.non_resident.lowest_vcn) {
856 VCN max_cluster;
857
858 max_cluster = (sle64_to_cpu(
859 attr->data.non_resident.allocated_size) +
860 vol->cluster_size - 1) >>
861 vol->cluster_size_bits;
862 /*
863 * If there is a difference between the highest_vcn and the
864 * highest cluster, the runlist is either corrupt or, more
865 * likely, there are more extents following this one.
866 */
867 if (deltaxcn < --max_cluster) {
868 ntfs_debug("More extents to follow; deltaxcn = 0x%llx, "
869 "max_cluster = 0x%llx",
870 (unsigned long long)deltaxcn,
871 (unsigned long long)max_cluster);
872 rl[rlpos].vcn = vcn;
873 vcn += rl[rlpos].length = max_cluster - deltaxcn;
874 rl[rlpos].lcn = LCN_RL_NOT_MAPPED;
875 rlpos++;
876 } else if (unlikely(deltaxcn > max_cluster)) {
877 ntfs_error(vol->sb, "Corrupt attribute. deltaxcn = "
878 "0x%llx, max_cluster = 0x%llx",
879 (unsigned long long)deltaxcn,
880 (unsigned long long)max_cluster);
881 goto mpa_err;
882 }
883 rl[rlpos].lcn = LCN_ENOENT;
884 } else /* Not the base extent. There may be more extents to follow. */
885 rl[rlpos].lcn = LCN_RL_NOT_MAPPED;
886
887 /* Setup terminating runlist element. */
888 rl[rlpos].vcn = vcn;
889 rl[rlpos].length = (s64)0;
890 /* If no existing runlist was specified, we are done. */
891 if (!old_rl) {
892 ntfs_debug("Mapping pairs array successfully decompressed:");
893 ntfs_debug_dump_runlist(rl);
894 return rl;
895 }
896 /* Now combine the new and old runlists checking for overlaps. */
897 old_rl = ntfs_runlists_merge(old_rl, rl);
898 if (likely(!IS_ERR(old_rl)))
899 return old_rl;
900 ntfs_free(rl);
901 ntfs_error(vol->sb, "Failed to merge runlists.");
902 return old_rl;
903io_error:
904 ntfs_error(vol->sb, "Corrupt attribute.");
905err_out:
906 ntfs_free(rl);
907 return ERR_PTR(-EIO);
908}
909
910/**
911 * ntfs_rl_vcn_to_lcn - convert a vcn into a lcn given a runlist
912 * @rl: runlist to use for conversion
913 * @vcn: vcn to convert
914 *
915 * Convert the virtual cluster number @vcn of an attribute into a logical
916 * cluster number (lcn) of a device using the runlist @rl to map vcns to their
917 * corresponding lcns.
918 *
919 * It is up to the caller to serialize access to the runlist @rl.
920 *
921 * Since lcns must be >= 0, we use negative return values with special meaning:
922 *
923 * Return value Meaning / Description
924 * ==================================================
925 * -1 = LCN_HOLE Hole / not allocated on disk.
926 * -2 = LCN_RL_NOT_MAPPED This is part of the runlist which has not been
927 * inserted into the runlist yet.
928 * -3 = LCN_ENOENT There is no such vcn in the attribute.
929 *
930 * Locking: - The caller must have locked the runlist (for reading or writing).
931 * - This function does not touch the lock.
932 */
933LCN ntfs_rl_vcn_to_lcn(const runlist_element *rl, const VCN vcn)
934{
935 int i;
936
937 BUG_ON(vcn < 0);
938 /*
939 * If rl is NULL, assume that we have found an unmapped runlist. The
940 * caller can then attempt to map it and fail appropriately if
941 * necessary.
942 */
943 if (unlikely(!rl))
944 return LCN_RL_NOT_MAPPED;
945
946 /* Catch out of lower bounds vcn. */
947 if (unlikely(vcn < rl[0].vcn))
948 return LCN_ENOENT;
949
950 for (i = 0; likely(rl[i].length); i++) {
951 if (unlikely(vcn < rl[i+1].vcn)) {
952 if (likely(rl[i].lcn >= (LCN)0))
953 return rl[i].lcn + (vcn - rl[i].vcn);
954 return rl[i].lcn;
955 }
956 }
957 /*
958 * The terminator element is setup to the correct value, i.e. one of
959 * LCN_HOLE, LCN_RL_NOT_MAPPED, or LCN_ENOENT.
960 */
961 if (likely(rl[i].lcn < (LCN)0))
962 return rl[i].lcn;
963 /* Just in case... We could replace this with BUG() some day. */
964 return LCN_ENOENT;
965}
966
967/**
968 * ntfs_get_nr_significant_bytes - get number of bytes needed to store a number
969 * @n: number for which to get the number of bytes for
970 *
971 * Return the number of bytes required to store @n unambiguously as
972 * a signed number.
973 *
974 * This is used in the context of the mapping pairs array to determine how
975 * many bytes will be needed in the array to store a given logical cluster
976 * number (lcn) or a specific run length.
977 *
978 * Return the number of bytes written. This function cannot fail.
979 */
980static inline int ntfs_get_nr_significant_bytes(const s64 n)
981{
982 s64 l = n;
983 int i;
984 s8 j;
985
986 i = 0;
987 do {
988 l >>= 8;
989 i++;
990 } while (l != 0 && l != -1);
991 j = (n >> 8 * (i - 1)) & 0xff;
992 /* If the sign bit is wrong, we need an extra byte. */
993 if ((n < 0 && j >= 0) || (n > 0 && j < 0))
994 i++;
995 return i;
996}
997
998/**
999 * ntfs_get_size_for_mapping_pairs - get bytes needed for mapping pairs array
1000 * @vol: ntfs volume (needed for the ntfs version)
1001 * @rl: locked runlist to determine the size of the mapping pairs of
1002 * @start_vcn: vcn at which to start the mapping pairs array
1003 *
1004 * Walk the locked runlist @rl and calculate the size in bytes of the mapping
1005 * pairs array corresponding to the runlist @rl, starting at vcn @start_vcn.
1006 * This for example allows us to allocate a buffer of the right size when
1007 * building the mapping pairs array.
1008 *
1009 * If @rl is NULL, just return 1 (for the single terminator byte).
1010 *
1011 * Return the calculated size in bytes on success. On error, return -errno.
1012 * The following error codes are defined:
1013 * -EINVAL - Run list contains unmapped elements. Make sure to only pass
1014 * fully mapped runlists to this function.
1015 * -EIO - The runlist is corrupt.
1016 *
1017 * Locking: @rl must be locked on entry (either for reading or writing), it
1018 * remains locked throughout, and is left locked upon return.
1019 */
1020int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol,
1021 const runlist_element *rl, const VCN start_vcn)
1022{
1023 LCN prev_lcn;
1024 int rls;
1025
1026 BUG_ON(start_vcn < 0);
1027 if (!rl) {
1028 BUG_ON(start_vcn);
1029 return 1;
1030 }
1031 /* Skip to runlist element containing @start_vcn. */
1032 while (rl->length && start_vcn >= rl[1].vcn)
1033 rl++;
1034 if ((!rl->length && start_vcn > rl->vcn) || start_vcn < rl->vcn)
1035 return -EINVAL;
1036 prev_lcn = 0;
1037 /* Always need the termining zero byte. */
1038 rls = 1;
1039 /* Do the first partial run if present. */
1040 if (start_vcn > rl->vcn) {
1041 s64 delta;
1042
1043 /* We know rl->length != 0 already. */
1044 if (rl->length < 0 || rl->lcn < LCN_HOLE)
1045 goto err_out;
1046 delta = start_vcn - rl->vcn;
1047 /* Header byte + length. */
1048 rls += 1 + ntfs_get_nr_significant_bytes(rl->length - delta);
1049 /*
1050 * If the logical cluster number (lcn) denotes a hole and we
1051 * are on NTFS 3.0+, we don't store it at all, i.e. we need
1052 * zero space. On earlier NTFS versions we just store the lcn.
1053 * Note: this assumes that on NTFS 1.2-, holes are stored with
1054 * an lcn of -1 and not a delta_lcn of -1 (unless both are -1).
1055 */
1056 if (rl->lcn >= 0 || vol->major_ver < 3) {
1057 prev_lcn = rl->lcn;
1058 if (rl->lcn >= 0)
1059 prev_lcn += delta;
1060 /* Change in lcn. */
1061 rls += ntfs_get_nr_significant_bytes(prev_lcn);
1062 }
1063 /* Go to next runlist element. */
1064 rl++;
1065 }
1066 /* Do the full runs. */
1067 for (; rl->length; rl++) {
1068 if (rl->length < 0 || rl->lcn < LCN_HOLE)
1069 goto err_out;
1070 /* Header byte + length. */
1071 rls += 1 + ntfs_get_nr_significant_bytes(rl->length);
1072 /*
1073 * If the logical cluster number (lcn) denotes a hole and we
1074 * are on NTFS 3.0+, we don't store it at all, i.e. we need
1075 * zero space. On earlier NTFS versions we just store the lcn.
1076 * Note: this assumes that on NTFS 1.2-, holes are stored with
1077 * an lcn of -1 and not a delta_lcn of -1 (unless both are -1).
1078 */
1079 if (rl->lcn >= 0 || vol->major_ver < 3) {
1080 /* Change in lcn. */
1081 rls += ntfs_get_nr_significant_bytes(rl->lcn -
1082 prev_lcn);
1083 prev_lcn = rl->lcn;
1084 }
1085 }
1086 return rls;
1087err_out:
1088 if (rl->lcn == LCN_RL_NOT_MAPPED)
1089 rls = -EINVAL;
1090 else
1091 rls = -EIO;
1092 return rls;
1093}
1094
1095/**
1096 * ntfs_write_significant_bytes - write the significant bytes of a number
1097 * @dst: destination buffer to write to
1098 * @dst_max: pointer to last byte of destination buffer for bounds checking
1099 * @n: number whose significant bytes to write
1100 *
1101 * Store in @dst, the minimum bytes of the number @n which are required to
1102 * identify @n unambiguously as a signed number, taking care not to exceed
1103 * @dest_max, the maximum position within @dst to which we are allowed to
1104 * write.
1105 *
1106 * This is used when building the mapping pairs array of a runlist to compress
1107 * a given logical cluster number (lcn) or a specific run length to the minumum
1108 * size possible.
1109 *
1110 * Return the number of bytes written on success. On error, i.e. the
1111 * destination buffer @dst is too small, return -ENOSPC.
1112 */
1113static inline int ntfs_write_significant_bytes(s8 *dst, const s8 *dst_max,
1114 const s64 n)
1115{
1116 s64 l = n;
1117 int i;
1118 s8 j;
1119
1120 i = 0;
1121 do {
1122 if (dst > dst_max)
1123 goto err_out;
1124 *dst++ = l & 0xffll;
1125 l >>= 8;
1126 i++;
1127 } while (l != 0 && l != -1);
1128 j = (n >> 8 * (i - 1)) & 0xff;
1129 /* If the sign bit is wrong, we need an extra byte. */
1130 if (n < 0 && j >= 0) {
1131 if (dst > dst_max)
1132 goto err_out;
1133 i++;
1134 *dst = (s8)-1;
1135 } else if (n > 0 && j < 0) {
1136 if (dst > dst_max)
1137 goto err_out;
1138 i++;
1139 *dst = (s8)0;
1140 }
1141 return i;
1142err_out:
1143 return -ENOSPC;
1144}
1145
1146/**
1147 * ntfs_mapping_pairs_build - build the mapping pairs array from a runlist
1148 * @vol: ntfs volume (needed for the ntfs version)
1149 * @dst: destination buffer to which to write the mapping pairs array
1150 * @dst_len: size of destination buffer @dst in bytes
1151 * @rl: locked runlist for which to build the mapping pairs array
1152 * @start_vcn: vcn at which to start the mapping pairs array
1153 * @stop_vcn: first vcn outside destination buffer on success or -ENOSPC
1154 *
1155 * Create the mapping pairs array from the locked runlist @rl, starting at vcn
1156 * @start_vcn and save the array in @dst. @dst_len is the size of @dst in
1157 * bytes and it should be at least equal to the value obtained by calling
1158 * ntfs_get_size_for_mapping_pairs().
1159 *
1160 * If @rl is NULL, just write a single terminator byte to @dst.
1161 *
1162 * On success or -ENOSPC error, if @stop_vcn is not NULL, *@stop_vcn is set to
1163 * the first vcn outside the destination buffer. Note that on error, @dst has
1164 * been filled with all the mapping pairs that will fit, thus it can be treated
1165 * as partial success, in that a new attribute extent needs to be created or
1166 * the next extent has to be used and the mapping pairs build has to be
1167 * continued with @start_vcn set to *@stop_vcn.
1168 *
1169 * Return 0 on success and -errno on error. The following error codes are
1170 * defined:
1171 * -EINVAL - Run list contains unmapped elements. Make sure to only pass
1172 * fully mapped runlists to this function.
1173 * -EIO - The runlist is corrupt.
1174 * -ENOSPC - The destination buffer is too small.
1175 *
1176 * Locking: @rl must be locked on entry (either for reading or writing), it
1177 * remains locked throughout, and is left locked upon return.
1178 */
1179int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst,
1180 const int dst_len, const runlist_element *rl,
1181 const VCN start_vcn, VCN *const stop_vcn)
1182{
1183 LCN prev_lcn;
1184 s8 *dst_max, *dst_next;
1185 int err = -ENOSPC;
1186 s8 len_len, lcn_len;
1187
1188 BUG_ON(start_vcn < 0);
1189 BUG_ON(dst_len < 1);
1190 if (!rl) {
1191 BUG_ON(start_vcn);
1192 if (stop_vcn)
1193 *stop_vcn = 0;
1194 /* Terminator byte. */
1195 *dst = 0;
1196 return 0;
1197 }
1198 /* Skip to runlist element containing @start_vcn. */
1199 while (rl->length && start_vcn >= rl[1].vcn)
1200 rl++;
1201 if ((!rl->length && start_vcn > rl->vcn) || start_vcn < rl->vcn)
1202 return -EINVAL;
1203 /*
1204 * @dst_max is used for bounds checking in
1205 * ntfs_write_significant_bytes().
1206 */
1207 dst_max = dst + dst_len - 1;
1208 prev_lcn = 0;
1209 /* Do the first partial run if present. */
1210 if (start_vcn > rl->vcn) {
1211 s64 delta;
1212
1213 /* We know rl->length != 0 already. */
1214 if (rl->length < 0 || rl->lcn < LCN_HOLE)
1215 goto err_out;
1216 delta = start_vcn - rl->vcn;
1217 /* Write length. */
1218 len_len = ntfs_write_significant_bytes(dst + 1, dst_max,
1219 rl->length - delta);
1220 if (len_len < 0)
1221 goto size_err;
1222 /*
1223 * If the logical cluster number (lcn) denotes a hole and we
1224 * are on NTFS 3.0+, we don't store it at all, i.e. we need
1225 * zero space. On earlier NTFS versions we just write the lcn
1226 * change. FIXME: Do we need to write the lcn change or just
1227 * the lcn in that case? Not sure as I have never seen this
1228 * case on NT4. - We assume that we just need to write the lcn
1229 * change until someone tells us otherwise... (AIA)
1230 */
1231 if (rl->lcn >= 0 || vol->major_ver < 3) {
1232 prev_lcn = rl->lcn;
1233 if (rl->lcn >= 0)
1234 prev_lcn += delta;
1235 /* Write change in lcn. */
1236 lcn_len = ntfs_write_significant_bytes(dst + 1 +
1237 len_len, dst_max, prev_lcn);
1238 if (lcn_len < 0)
1239 goto size_err;
1240 } else
1241 lcn_len = 0;
1242 dst_next = dst + len_len + lcn_len + 1;
1243 if (dst_next > dst_max)
1244 goto size_err;
1245 /* Update header byte. */
1246 *dst = lcn_len << 4 | len_len;
1247 /* Position at next mapping pairs array element. */
1248 dst = dst_next;
1249 /* Go to next runlist element. */
1250 rl++;
1251 }
1252 /* Do the full runs. */
1253 for (; rl->length; rl++) {
1254 if (rl->length < 0 || rl->lcn < LCN_HOLE)
1255 goto err_out;
1256 /* Write length. */
1257 len_len = ntfs_write_significant_bytes(dst + 1, dst_max,
1258 rl->length);
1259 if (len_len < 0)
1260 goto size_err;
1261 /*
1262 * If the logical cluster number (lcn) denotes a hole and we
1263 * are on NTFS 3.0+, we don't store it at all, i.e. we need
1264 * zero space. On earlier NTFS versions we just write the lcn
1265 * change. FIXME: Do we need to write the lcn change or just
1266 * the lcn in that case? Not sure as I have never seen this
1267 * case on NT4. - We assume that we just need to write the lcn
1268 * change until someone tells us otherwise... (AIA)
1269 */
1270 if (rl->lcn >= 0 || vol->major_ver < 3) {
1271 /* Write change in lcn. */
1272 lcn_len = ntfs_write_significant_bytes(dst + 1 +
1273 len_len, dst_max, rl->lcn - prev_lcn);
1274 if (lcn_len < 0)
1275 goto size_err;
1276 prev_lcn = rl->lcn;
1277 } else
1278 lcn_len = 0;
1279 dst_next = dst + len_len + lcn_len + 1;
1280 if (dst_next > dst_max)
1281 goto size_err;
1282 /* Update header byte. */
1283 *dst = lcn_len << 4 | len_len;
1284 /* Position at next mapping pairs array element. */
1285 dst = dst_next;
1286 }
1287 /* Success. */
1288 err = 0;
1289size_err:
1290 /* Set stop vcn. */
1291 if (stop_vcn)
1292 *stop_vcn = rl->vcn;
1293 /* Add terminator byte. */
1294 *dst = 0;
1295 return err;
1296err_out:
1297 if (rl->lcn == LCN_RL_NOT_MAPPED)
1298 err = -EINVAL;
1299 else
1300 err = -EIO;
1301 return err;
1302}
1303
1304/**
1305 * ntfs_rl_truncate_nolock - truncate a runlist starting at a specified vcn
1306 * @runlist: runlist to truncate
1307 * @new_length: the new length of the runlist in VCNs
1308 *
1309 * Truncate the runlist described by @runlist as well as the memory buffer
1310 * holding the runlist elements to a length of @new_length VCNs.
1311 *
1312 * If @new_length lies within the runlist, the runlist elements with VCNs of
1313 * @new_length and above are discarded.
1314 *
1315 * If @new_length lies beyond the runlist, a sparse runlist element is added to
1316 * the end of the runlist @runlist or if the last runlist element is a sparse
1317 * one already, this is extended.
1318 *
1319 * Return 0 on success and -errno on error.
1320 *
1321 * Locking: The caller must hold @runlist->lock for writing.
1322 */
1323int ntfs_rl_truncate_nolock(const ntfs_volume *vol, runlist *const runlist,
1324 const s64 new_length)
1325{
1326 runlist_element *rl;
1327 int old_size;
1328
1329 ntfs_debug("Entering for new_length 0x%llx.", (long long)new_length);
1330 BUG_ON(!runlist);
1331 BUG_ON(new_length < 0);
1332 rl = runlist->rl;
1333 if (unlikely(!rl)) {
1334 /*
1335 * Create a runlist consisting of a sparse runlist element of
1336 * length @new_length followed by a terminator runlist element.
1337 */
1338 rl = ntfs_malloc_nofs(PAGE_SIZE);
1339 if (unlikely(!rl)) {
1340 ntfs_error(vol->sb, "Not enough memory to allocate "
1341 "runlist element buffer.");
1342 return -ENOMEM;
1343 }
1344 runlist->rl = rl;
1345 rl[1].length = rl->vcn = 0;
1346 rl->lcn = LCN_HOLE;
1347 rl[1].vcn = rl->length = new_length;
1348 rl[1].lcn = LCN_ENOENT;
1349 return 0;
1350 }
1351 BUG_ON(new_length < rl->vcn);
1352 /* Find @new_length in the runlist. */
1353 while (likely(rl->length && new_length >= rl[1].vcn))
1354 rl++;
1355 /*
1356 * If not at the end of the runlist we need to shrink it.
1357 * If at the end of the runlist we need to expand it.
1358 */
1359 if (rl->length) {
1360 runlist_element *trl;
1361 BOOL is_end;
1362
1363 ntfs_debug("Shrinking runlist.");
1364 /* Determine the runlist size. */
1365 trl = rl + 1;
1366 while (likely(trl->length))
1367 trl++;
1368 old_size = trl - runlist->rl + 1;
1369 /* Truncate the run. */
1370 rl->length = new_length - rl->vcn;
1371 /*
1372 * If a run was partially truncated, make the following runlist
1373 * element a terminator.
1374 */
1375 is_end = FALSE;
1376 if (rl->length) {
1377 rl++;
1378 if (!rl->length)
1379 is_end = TRUE;
1380 rl->vcn = new_length;
1381 rl->length = 0;
1382 }
1383 rl->lcn = LCN_ENOENT;
1384 /* Reallocate memory if necessary. */
1385 if (!is_end) {
1386 int new_size = rl - runlist->rl + 1;
1387 rl = ntfs_rl_realloc(runlist->rl, old_size, new_size);
1388 if (IS_ERR(rl))
1389 ntfs_warning(vol->sb, "Failed to shrink "
1390 "runlist buffer. This just "
1391 "wastes a bit of memory "
1392 "temporarily so we ignore it "
1393 "and return success.");
1394 else
1395 runlist->rl = rl;
1396 }
1397 } else if (likely(/* !rl->length && */ new_length > rl->vcn)) {
1398 ntfs_debug("Expanding runlist.");
1399 /*
1400 * If there is a previous runlist element and it is a sparse
1401 * one, extend it. Otherwise need to add a new, sparse runlist
1402 * element.
1403 */
1404 if ((rl > runlist->rl) && ((rl - 1)->lcn == LCN_HOLE))
1405 (rl - 1)->length = new_length - (rl - 1)->vcn;
1406 else {
1407 /* Determine the runlist size. */
1408 old_size = rl - runlist->rl + 1;
1409 /* Reallocate memory if necessary. */
1410 rl = ntfs_rl_realloc(runlist->rl, old_size,
1411 old_size + 1);
1412 if (IS_ERR(rl)) {
1413 ntfs_error(vol->sb, "Failed to expand runlist "
1414 "buffer, aborting.");
1415 return PTR_ERR(rl);
1416 }
1417 runlist->rl = rl;
1418 /*
1419 * Set @rl to the same runlist element in the new
1420 * runlist as before in the old runlist.
1421 */
1422 rl += old_size - 1;
1423 /* Add a new, sparse runlist element. */
1424 rl->lcn = LCN_HOLE;
1425 rl->length = new_length - rl->vcn;
1426 /* Add a new terminator runlist element. */
1427 rl++;
1428 rl->length = 0;
1429 }
1430 rl->vcn = new_length;
1431 rl->lcn = LCN_ENOENT;
1432 } else /* if (unlikely(!rl->length && new_length == rl->vcn)) */ {
1433 /* Runlist already has same size as requested. */
1434 rl->lcn = LCN_ENOENT;
1435 }
1436 ntfs_debug("Done.");
1437 return 0;
1438}
diff --git a/fs/ntfs/runlist.h b/fs/ntfs/runlist.h
new file mode 100644
index 000000000000..7107fde59df9
--- /dev/null
+++ b/fs/ntfs/runlist.h
@@ -0,0 +1,89 @@
1/*
2 * runlist.h - Defines for runlist handling in NTFS Linux kernel driver.
3 * Part of the Linux-NTFS project.
4 *
5 * Copyright (c) 2001-2004 Anton Altaparmakov
6 * Copyright (c) 2002 Richard Russon
7 *
8 * This program/include file is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as published
10 * by the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program/include file is distributed in the hope that it will be
14 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
15 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program (in the main directory of the Linux-NTFS
20 * distribution in the file COPYING); if not, write to the Free Software
21 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
23
24#ifndef _LINUX_NTFS_RUNLIST_H
25#define _LINUX_NTFS_RUNLIST_H
26
27#include "types.h"
28#include "layout.h"
29#include "volume.h"
30
31/**
32 * runlist_element - in memory vcn to lcn mapping array element
33 * @vcn: starting vcn of the current array element
34 * @lcn: starting lcn of the current array element, or a negative LCN_* value
35 * @length: length in clusters of the current array element
36 *
37 * The last vcn (in fact the last vcn + 1) is reached when length == 0.
38 *
39 * When lcn == LCN_HOLE (-1) this means that the @length vcns starting at @vcn
40 * are not physically allocated (i.e. this is a hole / data is sparse).
41 */
42typedef struct { /* In memory vcn to lcn mapping structure element. */
43 VCN vcn; /* vcn = Starting virtual cluster number. */
44 LCN lcn; /* lcn = Starting logical cluster number or LCN_* special value. */
45 s64 length; /* Run length in clusters; 0 marks the terminator element. */
46} runlist_element;
47
48/**
49 * runlist - in memory vcn to lcn mapping array including a read/write lock
50 * @rl: pointer to an array of runlist elements (NULL after initialization)
51 * @lock: read/write semaphore for serializing access to @rl
52 *
53 */
54typedef struct {
55 runlist_element *rl; /* Set to NULL by ntfs_init_runlist(). */
56 struct rw_semaphore lock; /* A sleeping rw semaphore, not a spinlock. */
57} runlist;
58
59static inline void ntfs_init_runlist(runlist *rl)
60{
61 rl->rl = NULL;
62 init_rwsem(&rl->lock);
63}
64
65typedef enum { /* Negative values stored in runlist_element.lcn instead of a real lcn. */
66 LCN_HOLE = -1, /* Sparse run (hole). Keep this as highest value or die! */
67 LCN_RL_NOT_MAPPED = -2, /* Presumably: this part of the runlist is not mapped yet - confirm in runlist.c. */
68 LCN_ENOENT = -3, /* Used as the lcn of the terminator element (see ntfs_rl_truncate_nolock()). */
69} LCN_SPECIAL_VALUES;
70
71extern runlist_element *ntfs_runlists_merge(runlist_element *drl,
72 runlist_element *srl);
73
74extern runlist_element *ntfs_mapping_pairs_decompress(const ntfs_volume *vol,
75 const ATTR_RECORD *attr, runlist_element *old_rl);
76
77extern LCN ntfs_rl_vcn_to_lcn(const runlist_element *rl, const VCN vcn);
78
79extern int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol,
80 const runlist_element *rl, const VCN start_vcn);
81
82extern int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst,
83 const int dst_len, const runlist_element *rl,
84 const VCN start_vcn, VCN *const stop_vcn);
85
86extern int ntfs_rl_truncate_nolock(const ntfs_volume *vol,
87 runlist *const runlist, const s64 new_length);
88
89#endif /* _LINUX_NTFS_RUNLIST_H */
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
new file mode 100644
index 000000000000..212a3d0f2073
--- /dev/null
+++ b/fs/ntfs/super.c
@@ -0,0 +1,2771 @@
1/*
2 * super.c - NTFS kernel super block handling. Part of the Linux-NTFS project.
3 *
4 * Copyright (c) 2001-2004 Anton Altaparmakov
5 * Copyright (c) 2001,2002 Richard Russon
6 *
7 * This program/include file is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as published
9 * by the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program/include file is distributed in the hope that it will be
13 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program (in the main directory of the Linux-NTFS
19 * distribution in the file COPYING); if not, write to the Free Software
20 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 */
22
23#include <linux/stddef.h>
24#include <linux/init.h>
25#include <linux/string.h>
26#include <linux/spinlock.h>
27#include <linux/blkdev.h> /* For bdev_hardsect_size(). */
28#include <linux/backing-dev.h>
29#include <linux/buffer_head.h>
30#include <linux/vfs.h>
31#include <linux/moduleparam.h>
32#include <linux/smp_lock.h>
33
34#include "sysctl.h"
35#include "logfile.h"
36#include "quota.h"
37#include "dir.h"
38#include "debug.h"
39#include "index.h"
40#include "aops.h"
41#include "malloc.h"
42#include "ntfs.h"
43
44/* Number of mounted file systems which have compression enabled. */
45static unsigned long ntfs_nr_compression_users;
46
47/* A global default upcase table and a corresponding reference count. */
48static ntfschar *default_upcase = NULL;
49static unsigned long ntfs_nr_upcase_users = 0;
50
/*
 * Bit flags describing what to do on errors.  The constants and the strings
 * in on_errors_arr[] are also used in inode.c::ntfs_show_options().
 */
typedef enum {
	/* One of these must be present, default is ON_ERRORS_CONTINUE. */
	ON_ERRORS_PANIC		= 1 << 0,
	ON_ERRORS_REMOUNT_RO	= 1 << 1,
	ON_ERRORS_CONTINUE	= 1 << 2,
	/* Optional, can be combined with any of the above. */
	ON_ERRORS_RECOVER	= 1 << 4,
} ON_ERRORS_ACTIONS;
60
61const option_t on_errors_arr[] = {
62 { ON_ERRORS_PANIC, "panic" },
63 { ON_ERRORS_REMOUNT_RO, "remount-ro", },
64 { ON_ERRORS_CONTINUE, "continue", },
65 { ON_ERRORS_RECOVER, "recover" },
66 { 0, NULL }
67};
68
69/**
70 * simple_getbool -
71 *
72 * Copied from old ntfs driver (which copied from vfat driver).
73 */
74static int simple_getbool(char *s, BOOL *setval)
75{
76 if (s) {
77 if (!strcmp(s, "1") || !strcmp(s, "yes") || !strcmp(s, "true"))
78 *setval = TRUE;
79 else if (!strcmp(s, "0") || !strcmp(s, "no") ||
80 !strcmp(s, "false"))
81 *setval = FALSE;
82 else
83 return 0;
84 } else
85 *setval = TRUE;
86 return 1;
87}
88
89/**
90 * parse_options - parse the (re)mount options
91 * @vol: ntfs volume
92 * @opt: string containing the (re)mount options
93 *
94 * Parse the recognized options in @opt for the ntfs volume described by @vol.
95 */
96static BOOL parse_options(ntfs_volume *vol, char *opt)
97{
98 char *p, *v, *ov;
99 static char *utf8 = "utf8";
100 int errors = 0, sloppy = 0;
101 uid_t uid = (uid_t)-1;
102 gid_t gid = (gid_t)-1;
103 mode_t fmask = (mode_t)-1, dmask = (mode_t)-1;
104 int mft_zone_multiplier = -1, on_errors = -1;
105 int show_sys_files = -1, case_sensitive = -1;
106 struct nls_table *nls_map = NULL, *old_nls;
107
108 /* I am lazy... (-8 */
109#define NTFS_GETOPT_WITH_DEFAULT(option, variable, default_value) \
110 if (!strcmp(p, option)) { \
111 if (!v || !*v) \
112 variable = default_value; \
113 else { \
114 variable = simple_strtoul(ov = v, &v, 0); \
115 if (*v) \
116 goto needs_val; \
117 } \
118 }
119#define NTFS_GETOPT(option, variable) \
120 if (!strcmp(p, option)) { \
121 if (!v || !*v) \
122 goto needs_arg; \
123 variable = simple_strtoul(ov = v, &v, 0); \
124 if (*v) \
125 goto needs_val; \
126 }
127#define NTFS_GETOPT_BOOL(option, variable) \
128 if (!strcmp(p, option)) { \
129 BOOL val; \
130 if (!simple_getbool(v, &val)) \
131 goto needs_bool; \
132 variable = val; \
133 }
134#define NTFS_GETOPT_OPTIONS_ARRAY(option, variable, opt_array) \
135 if (!strcmp(p, option)) { \
136 int _i; \
137 if (!v || !*v) \
138 goto needs_arg; \
139 ov = v; \
140 if (variable == -1) \
141 variable = 0; \
142 for (_i = 0; opt_array[_i].str && *opt_array[_i].str; _i++) \
143 if (!strcmp(opt_array[_i].str, v)) { \
144 variable |= opt_array[_i].val; \
145 break; \
146 } \
147 if (!opt_array[_i].str || !*opt_array[_i].str) \
148 goto needs_val; \
149 }
150 if (!opt || !*opt)
151 goto no_mount_options;
152 ntfs_debug("Entering with mount options string: %s", opt);
153 while ((p = strsep(&opt, ","))) {
154 if ((v = strchr(p, '=')))
155 *v++ = 0;
156 NTFS_GETOPT("uid", uid)
157 else NTFS_GETOPT("gid", gid)
158 else NTFS_GETOPT("umask", fmask = dmask)
159 else NTFS_GETOPT("fmask", fmask)
160 else NTFS_GETOPT("dmask", dmask)
161 else NTFS_GETOPT("mft_zone_multiplier", mft_zone_multiplier)
162 else NTFS_GETOPT_WITH_DEFAULT("sloppy", sloppy, TRUE)
163 else NTFS_GETOPT_BOOL("show_sys_files", show_sys_files)
164 else NTFS_GETOPT_BOOL("case_sensitive", case_sensitive)
165 else NTFS_GETOPT_OPTIONS_ARRAY("errors", on_errors,
166 on_errors_arr)
167 else if (!strcmp(p, "posix") || !strcmp(p, "show_inodes"))
168 ntfs_warning(vol->sb, "Ignoring obsolete option %s.",
169 p);
170 else if (!strcmp(p, "nls") || !strcmp(p, "iocharset")) {
171 if (!strcmp(p, "iocharset"))
172 ntfs_warning(vol->sb, "Option iocharset is "
173 "deprecated. Please use "
174 "option nls=<charsetname> in "
175 "the future.");
176 if (!v || !*v)
177 goto needs_arg;
178use_utf8:
179 old_nls = nls_map;
180 nls_map = load_nls(v);
181 if (!nls_map) {
182 if (!old_nls) {
183 ntfs_error(vol->sb, "NLS character set "
184 "%s not found.", v);
185 return FALSE;
186 }
187 ntfs_error(vol->sb, "NLS character set %s not "
188 "found. Using previous one %s.",
189 v, old_nls->charset);
190 nls_map = old_nls;
191 } else /* nls_map */ {
192 if (old_nls)
193 unload_nls(old_nls);
194 }
195 } else if (!strcmp(p, "utf8")) {
196 BOOL val = FALSE;
197 ntfs_warning(vol->sb, "Option utf8 is no longer "
198 "supported, using option nls=utf8. Please "
199 "use option nls=utf8 in the future and "
200 "make sure utf8 is compiled either as a "
201 "module or into the kernel.");
202 if (!v || !*v)
203 val = TRUE;
204 else if (!simple_getbool(v, &val))
205 goto needs_bool;
206 if (val) {
207 v = utf8;
208 goto use_utf8;
209 }
210 } else {
211 ntfs_error(vol->sb, "Unrecognized mount option %s.", p);
212 if (errors < INT_MAX)
213 errors++;
214 }
215#undef NTFS_GETOPT_OPTIONS_ARRAY
216#undef NTFS_GETOPT_BOOL
217#undef NTFS_GETOPT
218#undef NTFS_GETOPT_WITH_DEFAULT
219 }
220no_mount_options:
221 if (errors && !sloppy)
222 return FALSE;
223 if (sloppy)
224 ntfs_warning(vol->sb, "Sloppy option given. Ignoring "
225 "unrecognized mount option(s) and continuing.");
226 /* Keep this first! */
227 if (on_errors != -1) {
228 if (!on_errors) {
229 ntfs_error(vol->sb, "Invalid errors option argument "
230 "or bug in options parser.");
231 return FALSE;
232 }
233 }
234 if (nls_map) {
235 if (vol->nls_map && vol->nls_map != nls_map) {
236 ntfs_error(vol->sb, "Cannot change NLS character set "
237 "on remount.");
238 return FALSE;
239 } /* else (!vol->nls_map) */
240 ntfs_debug("Using NLS character set %s.", nls_map->charset);
241 vol->nls_map = nls_map;
242 } else /* (!nls_map) */ {
243 if (!vol->nls_map) {
244 vol->nls_map = load_nls_default();
245 if (!vol->nls_map) {
246 ntfs_error(vol->sb, "Failed to load default "
247 "NLS character set.");
248 return FALSE;
249 }
250 ntfs_debug("Using default NLS character set (%s).",
251 vol->nls_map->charset);
252 }
253 }
254 if (mft_zone_multiplier != -1) {
255 if (vol->mft_zone_multiplier && vol->mft_zone_multiplier !=
256 mft_zone_multiplier) {
257 ntfs_error(vol->sb, "Cannot change mft_zone_multiplier "
258 "on remount.");
259 return FALSE;
260 }
261 if (mft_zone_multiplier < 1 || mft_zone_multiplier > 4) {
262 ntfs_error(vol->sb, "Invalid mft_zone_multiplier. "
263 "Using default value, i.e. 1.");
264 mft_zone_multiplier = 1;
265 }
266 vol->mft_zone_multiplier = mft_zone_multiplier;
267 }
268 if (!vol->mft_zone_multiplier)
269 vol->mft_zone_multiplier = 1;
270 if (on_errors != -1)
271 vol->on_errors = on_errors;
272 if (!vol->on_errors || vol->on_errors == ON_ERRORS_RECOVER)
273 vol->on_errors |= ON_ERRORS_CONTINUE;
274 if (uid != (uid_t)-1)
275 vol->uid = uid;
276 if (gid != (gid_t)-1)
277 vol->gid = gid;
278 if (fmask != (mode_t)-1)
279 vol->fmask = fmask;
280 if (dmask != (mode_t)-1)
281 vol->dmask = dmask;
282 if (show_sys_files != -1) {
283 if (show_sys_files)
284 NVolSetShowSystemFiles(vol);
285 else
286 NVolClearShowSystemFiles(vol);
287 }
288 if (case_sensitive != -1) {
289 if (case_sensitive)
290 NVolSetCaseSensitive(vol);
291 else
292 NVolClearCaseSensitive(vol);
293 }
294 return TRUE;
295needs_arg:
296 ntfs_error(vol->sb, "The %s option requires an argument.", p);
297 return FALSE;
298needs_bool:
299 ntfs_error(vol->sb, "The %s option requires a boolean argument.", p);
300 return FALSE;
301needs_val:
302 ntfs_error(vol->sb, "Invalid %s option argument: %s", p, ov);
303 return FALSE;
304}
305
306#ifdef NTFS_RW
307
308/**
309 * ntfs_write_volume_flags - write new flags to the volume information flags
310 * @vol: ntfs volume on which to modify the flags
311 * @flags: new flags value for the volume information flags
312 *
313 * Internal function. You probably want to use ntfs_{set,clear}_volume_flags()
314 * instead (see below).
315 *
316 * Replace the volume information flags on the volume @vol with the value
317 * supplied in @flags. Note, this overwrites the volume information flags, so
318 * make sure to combine the flags you want to modify with the old flags and use
319 * the result when calling ntfs_write_volume_flags().
320 *
321 * Return 0 on success and -errno on error.
322 */
323static int ntfs_write_volume_flags(ntfs_volume *vol, const VOLUME_FLAGS flags)
324{
325 ntfs_inode *ni = NTFS_I(vol->vol_ino);
326 MFT_RECORD *m;
327 VOLUME_INFORMATION *vi;
328 ntfs_attr_search_ctx *ctx;
329 int err;
330
331 ntfs_debug("Entering, old flags = 0x%x, new flags = 0x%x.",
332 le16_to_cpu(vol->vol_flags), le16_to_cpu(flags));
333 if (vol->vol_flags == flags)
334 goto done;
335 BUG_ON(!ni);
336 m = map_mft_record(ni);
337 if (IS_ERR(m)) {
338 err = PTR_ERR(m);
339 goto err_out;
340 }
341 ctx = ntfs_attr_get_search_ctx(ni, m);
342 if (!ctx) {
343 err = -ENOMEM;
344 goto put_unm_err_out;
345 }
346 err = ntfs_attr_lookup(AT_VOLUME_INFORMATION, NULL, 0, 0, 0, NULL, 0,
347 ctx);
348 if (err)
349 goto put_unm_err_out;
350 vi = (VOLUME_INFORMATION*)((u8*)ctx->attr +
351 le16_to_cpu(ctx->attr->data.resident.value_offset));
352 vol->vol_flags = vi->flags = flags;
353 flush_dcache_mft_record_page(ctx->ntfs_ino);
354 mark_mft_record_dirty(ctx->ntfs_ino);
355 ntfs_attr_put_search_ctx(ctx);
356 unmap_mft_record(ni);
357done:
358 ntfs_debug("Done.");
359 return 0;
360put_unm_err_out:
361 if (ctx)
362 ntfs_attr_put_search_ctx(ctx);
363 unmap_mft_record(ni);
364err_out:
365 ntfs_error(vol->sb, "Failed with error code %i.", -err);
366 return err;
367}
368
369/**
370 * ntfs_set_volume_flags - set bits in the volume information flags
371 * @vol: ntfs volume on which to modify the flags
372 * @flags: flags to set on the volume
373 *
374 * Set the bits in @flags in the volume information flags on the volume @vol.
375 *
376 * Return 0 on success and -errno on error.
377 */
378static inline int ntfs_set_volume_flags(ntfs_volume *vol, VOLUME_FLAGS flags)
379{
380 flags &= VOLUME_FLAGS_MASK;
381 return ntfs_write_volume_flags(vol, vol->vol_flags | flags);
382}
383
384/**
385 * ntfs_clear_volume_flags - clear bits in the volume information flags
386 * @vol: ntfs volume on which to modify the flags
387 * @flags: flags to clear on the volume
388 *
389 * Clear the bits in @flags in the volume information flags on the volume @vol.
390 *
391 * Return 0 on success and -errno on error.
392 */
393static inline int ntfs_clear_volume_flags(ntfs_volume *vol, VOLUME_FLAGS flags)
394{
395 flags &= VOLUME_FLAGS_MASK;
396 flags = vol->vol_flags & cpu_to_le16(~le16_to_cpu(flags));
397 return ntfs_write_volume_flags(vol, flags);
398}
399
400#endif /* NTFS_RW */
401
402/**
403 * ntfs_remount - change the mount options of a mounted ntfs filesystem
404 * @sb: superblock of mounted ntfs filesystem
405 * @flags: remount flags
406 * @opt: remount options string
407 *
408 * Change the mount options of an already mounted ntfs filesystem.
409 *
410 * NOTE: The VFS sets the @sb->s_flags remount flags to @flags after
411 * ntfs_remount() returns successfully (i.e. returns 0). Otherwise,
412 * @sb->s_flags are not changed.
413 */
414static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
415{
416 ntfs_volume *vol = NTFS_SB(sb);
417
418 ntfs_debug("Entering with remount options string: %s", opt);
419#ifndef NTFS_RW
420 /* For read-only compiled driver, enforce all read-only flags. */
421 *flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME;
422#else /* NTFS_RW */
423 /*
424 * For the read-write compiled driver, if we are remounting read-write,
425 * make sure there are no volume errors and that no unsupported volume
426 * flags are set. Also, empty the logfile journal as it would become
427 * stale as soon as something is written to the volume and mark the
428 * volume dirty so that chkdsk is run if the volume is not umounted
429 * cleanly. Finally, mark the quotas out of date so Windows rescans
430 * the volume on boot and updates them.
431 *
432 * When remounting read-only, mark the volume clean if no volume errors
433 * have occured.
434 */
435 if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
436 static const char *es = ". Cannot remount read-write.";
437
438 /* Remounting read-write. */
439 if (NVolErrors(vol)) {
440 ntfs_error(sb, "Volume has errors and is read-only%s",
441 es);
442 return -EROFS;
443 }
444 if (vol->vol_flags & VOLUME_IS_DIRTY) {
445 ntfs_error(sb, "Volume is dirty and read-only%s", es);
446 return -EROFS;
447 }
448 if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) {
449 ntfs_error(sb, "Volume has unsupported flags set and "
450 "is read-only%s", es);
451 return -EROFS;
452 }
453 if (ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) {
454 ntfs_error(sb, "Failed to set dirty bit in volume "
455 "information flags%s", es);
456 return -EROFS;
457 }
458#if 0
459 // TODO: Enable this code once we start modifying anything that
460 // is different between NTFS 1.2 and 3.x...
461 /* Set NT4 compatibility flag on newer NTFS version volumes. */
462 if ((vol->major_ver > 1)) {
463 if (ntfs_set_volume_flags(vol, VOLUME_MOUNTED_ON_NT4)) {
464 ntfs_error(sb, "Failed to set NT4 "
465 "compatibility flag%s", es);
466 NVolSetErrors(vol);
467 return -EROFS;
468 }
469 }
470#endif
471 if (!ntfs_empty_logfile(vol->logfile_ino)) {
472 ntfs_error(sb, "Failed to empty journal $LogFile%s",
473 es);
474 NVolSetErrors(vol);
475 return -EROFS;
476 }
477 if (!ntfs_mark_quotas_out_of_date(vol)) {
478 ntfs_error(sb, "Failed to mark quotas out of date%s",
479 es);
480 NVolSetErrors(vol);
481 return -EROFS;
482 }
483 } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) {
484 /* Remounting read-only. */
485 if (!NVolErrors(vol)) {
486 if (ntfs_clear_volume_flags(vol, VOLUME_IS_DIRTY))
487 ntfs_warning(sb, "Failed to clear dirty bit "
488 "in volume information "
489 "flags. Run chkdsk.");
490 }
491 }
492#endif /* NTFS_RW */
493
494 // TODO: Deal with *flags.
495
496 if (!parse_options(vol, opt))
497 return -EINVAL;
498 ntfs_debug("Done.");
499 return 0;
500}
501
502/**
503 * is_boot_sector_ntfs - check whether a boot sector is a valid NTFS boot sector
504 * @sb: Super block of the device to which @b belongs.
505 * @b: Boot sector of device @sb to check.
506 * @silent: If TRUE, all output will be silenced.
507 *
508 * is_boot_sector_ntfs() checks whether the boot sector @b is a valid NTFS boot
509 * sector. Returns TRUE if it is valid and FALSE if not.
510 *
511 * @sb is only needed for warning/error output, i.e. it can be NULL when silent
512 * is TRUE.
513 */
514static BOOL is_boot_sector_ntfs(const struct super_block *sb,
515 const NTFS_BOOT_SECTOR *b, const BOOL silent)
516{
517 /*
518 * Check that checksum == sum of u32 values from b to the checksum
519 * field. If checksum is zero, no checking is done.
520 */
521 if ((void*)b < (void*)&b->checksum && b->checksum) {
522 le32 *u;
523 u32 i;
524
525 for (i = 0, u = (le32*)b; u < (le32*)(&b->checksum); ++u)
526 i += le32_to_cpup(u);
527 if (le32_to_cpu(b->checksum) != i)
528 goto not_ntfs;
529 }
530 /* Check OEMidentifier is "NTFS " */
531 if (b->oem_id != magicNTFS)
532 goto not_ntfs;
533 /* Check bytes per sector value is between 256 and 4096. */
534 if (le16_to_cpu(b->bpb.bytes_per_sector) < 0x100 ||
535 le16_to_cpu(b->bpb.bytes_per_sector) > 0x1000)
536 goto not_ntfs;
537 /* Check sectors per cluster value is valid. */
538 switch (b->bpb.sectors_per_cluster) {
539 case 1: case 2: case 4: case 8: case 16: case 32: case 64: case 128:
540 break;
541 default:
542 goto not_ntfs;
543 }
544 /* Check the cluster size is not above 65536 bytes. */
545 if ((u32)le16_to_cpu(b->bpb.bytes_per_sector) *
546 b->bpb.sectors_per_cluster > 0x10000)
547 goto not_ntfs;
548 /* Check reserved/unused fields are really zero. */
549 if (le16_to_cpu(b->bpb.reserved_sectors) ||
550 le16_to_cpu(b->bpb.root_entries) ||
551 le16_to_cpu(b->bpb.sectors) ||
552 le16_to_cpu(b->bpb.sectors_per_fat) ||
553 le32_to_cpu(b->bpb.large_sectors) || b->bpb.fats)
554 goto not_ntfs;
555 /* Check clusters per file mft record value is valid. */
556 if ((u8)b->clusters_per_mft_record < 0xe1 ||
557 (u8)b->clusters_per_mft_record > 0xf7)
558 switch (b->clusters_per_mft_record) {
559 case 1: case 2: case 4: case 8: case 16: case 32: case 64:
560 break;
561 default:
562 goto not_ntfs;
563 }
564 /* Check clusters per index block value is valid. */
565 if ((u8)b->clusters_per_index_record < 0xe1 ||
566 (u8)b->clusters_per_index_record > 0xf7)
567 switch (b->clusters_per_index_record) {
568 case 1: case 2: case 4: case 8: case 16: case 32: case 64:
569 break;
570 default:
571 goto not_ntfs;
572 }
573 /*
574 * Check for valid end of sector marker. We will work without it, but
575 * many BIOSes will refuse to boot from a bootsector if the magic is
576 * incorrect, so we emit a warning.
577 */
578 if (!silent && b->end_of_sector_marker != cpu_to_le16(0xaa55))
579 ntfs_warning(sb, "Invalid end of sector marker.");
580 return TRUE;
581not_ntfs:
582 return FALSE;
583}
584
585/**
586 * read_ntfs_boot_sector - read the NTFS boot sector of a device
587 * @sb: super block of device to read the boot sector from
588 * @silent: if true, suppress all output
589 *
590 * Reads the boot sector from the device and validates it. If that fails, tries
591 * to read the backup boot sector, first from the end of the device a-la NT4 and
592 * later and then from the middle of the device a-la NT3.51 and before.
593 *
594 * If a valid boot sector is found but it is not the primary boot sector, we
595 * repair the primary boot sector silently (unless the device is read-only or
596 * the primary boot sector is not accessible).
597 *
598 * NOTE: To call this function, @sb must have the fields s_dev, the ntfs super
599 * block (u.ntfs_sb), nr_blocks and the device flags (s_flags) initialized
600 * to their respective values.
601 *
602 * Return the unlocked buffer head containing the boot sector or NULL on error.
603 */
604static struct buffer_head *read_ntfs_boot_sector(struct super_block *sb,
605 const int silent)
606{
607 const char *read_err_str = "Unable to read %s boot sector.";
608 struct buffer_head *bh_primary, *bh_backup;
609 long nr_blocks = NTFS_SB(sb)->nr_blocks;
610
611 /* Try to read primary boot sector. */
612 if ((bh_primary = sb_bread(sb, 0))) {
613 if (is_boot_sector_ntfs(sb, (NTFS_BOOT_SECTOR*)
614 bh_primary->b_data, silent))
615 return bh_primary;
616 if (!silent)
617 ntfs_error(sb, "Primary boot sector is invalid.");
618 } else if (!silent)
619 ntfs_error(sb, read_err_str, "primary");
620 if (!(NTFS_SB(sb)->on_errors & ON_ERRORS_RECOVER)) {
621 if (bh_primary)
622 brelse(bh_primary);
623 if (!silent)
624 ntfs_error(sb, "Mount option errors=recover not used. "
625 "Aborting without trying to recover.");
626 return NULL;
627 }
628 /* Try to read NT4+ backup boot sector. */
629 if ((bh_backup = sb_bread(sb, nr_blocks - 1))) {
630 if (is_boot_sector_ntfs(sb, (NTFS_BOOT_SECTOR*)
631 bh_backup->b_data, silent))
632 goto hotfix_primary_boot_sector;
633 brelse(bh_backup);
634 } else if (!silent)
635 ntfs_error(sb, read_err_str, "backup");
636 /* Try to read NT3.51- backup boot sector. */
637 if ((bh_backup = sb_bread(sb, nr_blocks >> 1))) {
638 if (is_boot_sector_ntfs(sb, (NTFS_BOOT_SECTOR*)
639 bh_backup->b_data, silent))
640 goto hotfix_primary_boot_sector;
641 if (!silent)
642 ntfs_error(sb, "Could not find a valid backup boot "
643 "sector.");
644 brelse(bh_backup);
645 } else if (!silent)
646 ntfs_error(sb, read_err_str, "backup");
647 /* We failed. Cleanup and return. */
648 if (bh_primary)
649 brelse(bh_primary);
650 return NULL;
651hotfix_primary_boot_sector:
652 if (bh_primary) {
653 /*
654 * If we managed to read sector zero and the volume is not
655 * read-only, copy the found, valid backup boot sector to the
656 * primary boot sector.
657 */
658 if (!(sb->s_flags & MS_RDONLY)) {
659 ntfs_warning(sb, "Hot-fix: Recovering invalid primary "
660 "boot sector from backup copy.");
661 memcpy(bh_primary->b_data, bh_backup->b_data,
662 sb->s_blocksize);
663 mark_buffer_dirty(bh_primary);
664 sync_dirty_buffer(bh_primary);
665 if (buffer_uptodate(bh_primary)) {
666 brelse(bh_backup);
667 return bh_primary;
668 }
669 ntfs_error(sb, "Hot-fix: Device write error while "
670 "recovering primary boot sector.");
671 } else {
672 ntfs_warning(sb, "Hot-fix: Recovery of primary boot "
673 "sector failed: Read-only mount.");
674 }
675 brelse(bh_primary);
676 }
677 ntfs_warning(sb, "Using backup boot sector.");
678 return bh_backup;
679}
680
681/**
682 * parse_ntfs_boot_sector - parse the boot sector and store the data in @vol
683 * @vol: volume structure to initialise with data from boot sector
684 * @b: boot sector to parse
685 *
686 * Parse the ntfs boot sector @b and store all important information therein in
687 * the ntfs super block @vol. Return TRUE on success and FALSE on error.
688 */
689static BOOL parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b)
690{
691 unsigned int sectors_per_cluster_bits, nr_hidden_sects;
692 int clusters_per_mft_record, clusters_per_index_record;
693 s64 ll;
694
695 vol->sector_size = le16_to_cpu(b->bpb.bytes_per_sector);
696 vol->sector_size_bits = ffs(vol->sector_size) - 1;
697 ntfs_debug("vol->sector_size = %i (0x%x)", vol->sector_size,
698 vol->sector_size);
699 ntfs_debug("vol->sector_size_bits = %i (0x%x)", vol->sector_size_bits,
700 vol->sector_size_bits);
701 if (vol->sector_size != vol->sb->s_blocksize)
702 ntfs_warning(vol->sb, "The boot sector indicates a sector size "
703 "different from the device sector size.");
704 ntfs_debug("sectors_per_cluster = 0x%x", b->bpb.sectors_per_cluster);
705 sectors_per_cluster_bits = ffs(b->bpb.sectors_per_cluster) - 1;
706 ntfs_debug("sectors_per_cluster_bits = 0x%x",
707 sectors_per_cluster_bits);
708 nr_hidden_sects = le32_to_cpu(b->bpb.hidden_sectors);
709 ntfs_debug("number of hidden sectors = 0x%x", nr_hidden_sects);
710 vol->cluster_size = vol->sector_size << sectors_per_cluster_bits;
711 vol->cluster_size_mask = vol->cluster_size - 1;
712 vol->cluster_size_bits = ffs(vol->cluster_size) - 1;
713 ntfs_debug("vol->cluster_size = %i (0x%x)", vol->cluster_size,
714 vol->cluster_size);
715 ntfs_debug("vol->cluster_size_mask = 0x%x", vol->cluster_size_mask);
716 ntfs_debug("vol->cluster_size_bits = %i (0x%x)",
717 vol->cluster_size_bits, vol->cluster_size_bits);
718 if (vol->sector_size > vol->cluster_size) {
719 ntfs_error(vol->sb, "Sector sizes above the cluster size are "
720 "not supported. Sorry.");
721 return FALSE;
722 }
723 if (vol->sb->s_blocksize > vol->cluster_size) {
724 ntfs_error(vol->sb, "Cluster sizes smaller than the device "
725 "sector size are not supported. Sorry.");
726 return FALSE;
727 }
728 clusters_per_mft_record = b->clusters_per_mft_record;
729 ntfs_debug("clusters_per_mft_record = %i (0x%x)",
730 clusters_per_mft_record, clusters_per_mft_record);
731 if (clusters_per_mft_record > 0)
732 vol->mft_record_size = vol->cluster_size <<
733 (ffs(clusters_per_mft_record) - 1);
734 else
735 /*
736 * When mft_record_size < cluster_size, clusters_per_mft_record
737 * = -log2(mft_record_size) bytes. mft_record_size normaly is
738 * 1024 bytes, which is encoded as 0xF6 (-10 in decimal).
739 */
740 vol->mft_record_size = 1 << -clusters_per_mft_record;
741 vol->mft_record_size_mask = vol->mft_record_size - 1;
742 vol->mft_record_size_bits = ffs(vol->mft_record_size) - 1;
743 ntfs_debug("vol->mft_record_size = %i (0x%x)", vol->mft_record_size,
744 vol->mft_record_size);
745 ntfs_debug("vol->mft_record_size_mask = 0x%x",
746 vol->mft_record_size_mask);
747 ntfs_debug("vol->mft_record_size_bits = %i (0x%x)",
748 vol->mft_record_size_bits, vol->mft_record_size_bits);
749 /*
750 * We cannot support mft record sizes above the PAGE_CACHE_SIZE since
751 * we store $MFT/$DATA, the table of mft records in the page cache.
752 */
753 if (vol->mft_record_size > PAGE_CACHE_SIZE) {
754 ntfs_error(vol->sb, "Mft record size %i (0x%x) exceeds the "
755 "page cache size on your system %lu (0x%lx). "
756 "This is not supported. Sorry.",
757 vol->mft_record_size, vol->mft_record_size,
758 PAGE_CACHE_SIZE, PAGE_CACHE_SIZE);
759 return FALSE;
760 }
761 clusters_per_index_record = b->clusters_per_index_record;
762 ntfs_debug("clusters_per_index_record = %i (0x%x)",
763 clusters_per_index_record, clusters_per_index_record);
764 if (clusters_per_index_record > 0)
765 vol->index_record_size = vol->cluster_size <<
766 (ffs(clusters_per_index_record) - 1);
767 else
768 /*
769 * When index_record_size < cluster_size,
770 * clusters_per_index_record = -log2(index_record_size) bytes.
771 * index_record_size normaly equals 4096 bytes, which is
772 * encoded as 0xF4 (-12 in decimal).
773 */
774 vol->index_record_size = 1 << -clusters_per_index_record;
775 vol->index_record_size_mask = vol->index_record_size - 1;
776 vol->index_record_size_bits = ffs(vol->index_record_size) - 1;
777 ntfs_debug("vol->index_record_size = %i (0x%x)",
778 vol->index_record_size, vol->index_record_size);
779 ntfs_debug("vol->index_record_size_mask = 0x%x",
780 vol->index_record_size_mask);
781 ntfs_debug("vol->index_record_size_bits = %i (0x%x)",
782 vol->index_record_size_bits,
783 vol->index_record_size_bits);
784 /*
785 * Get the size of the volume in clusters and check for 64-bit-ness.
786 * Windows currently only uses 32 bits to save the clusters so we do
787 * the same as it is much faster on 32-bit CPUs.
788 */
789 ll = sle64_to_cpu(b->number_of_sectors) >> sectors_per_cluster_bits;
790 if ((u64)ll >= 1ULL << 32) {
791 ntfs_error(vol->sb, "Cannot handle 64-bit clusters. Sorry.");
792 return FALSE;
793 }
794 vol->nr_clusters = ll;
795 ntfs_debug("vol->nr_clusters = 0x%llx", (long long)vol->nr_clusters);
796 /*
797 * On an architecture where unsigned long is 32-bits, we restrict the
798 * volume size to 2TiB (2^41). On a 64-bit architecture, the compiler
799 * will hopefully optimize the whole check away.
800 */
801 if (sizeof(unsigned long) < 8) {
802 if ((ll << vol->cluster_size_bits) >= (1ULL << 41)) {
803 ntfs_error(vol->sb, "Volume size (%lluTiB) is too "
804 "large for this architecture. "
805 "Maximum supported is 2TiB. Sorry.",
806 (unsigned long long)ll >> (40 -
807 vol->cluster_size_bits));
808 return FALSE;
809 }
810 }
811 ll = sle64_to_cpu(b->mft_lcn);
812 if (ll >= vol->nr_clusters) {
813 ntfs_error(vol->sb, "MFT LCN is beyond end of volume. Weird.");
814 return FALSE;
815 }
816 vol->mft_lcn = ll;
817 ntfs_debug("vol->mft_lcn = 0x%llx", (long long)vol->mft_lcn);
818 ll = sle64_to_cpu(b->mftmirr_lcn);
819 if (ll >= vol->nr_clusters) {
820 ntfs_error(vol->sb, "MFTMirr LCN is beyond end of volume. "
821 "Weird.");
822 return FALSE;
823 }
824 vol->mftmirr_lcn = ll;
825 ntfs_debug("vol->mftmirr_lcn = 0x%llx", (long long)vol->mftmirr_lcn);
826#ifdef NTFS_RW
827 /*
828 * Work out the size of the mft mirror in number of mft records. If the
829 * cluster size is less than or equal to the size taken by four mft
830 * records, the mft mirror stores the first four mft records. If the
831 * cluster size is bigger than the size taken by four mft records, the
832 * mft mirror contains as many mft records as will fit into one
833 * cluster.
834 */
835 if (vol->cluster_size <= (4 << vol->mft_record_size_bits))
836 vol->mftmirr_size = 4;
837 else
838 vol->mftmirr_size = vol->cluster_size >>
839 vol->mft_record_size_bits;
840 ntfs_debug("vol->mftmirr_size = %i", vol->mftmirr_size);
841#endif /* NTFS_RW */
842 vol->serial_no = le64_to_cpu(b->volume_serial_number);
843 ntfs_debug("vol->serial_no = 0x%llx",
844 (unsigned long long)vol->serial_no);
845 return TRUE;
846}
847
848/**
849 * ntfs_setup_allocators - initialize the cluster and mft allocators
850 * @vol: volume structure for which to setup the allocators
851 *
852 * Setup the cluster (lcn) and mft allocators to the starting values.
853 */
854static void ntfs_setup_allocators(ntfs_volume *vol)
855{
856#ifdef NTFS_RW
857 LCN mft_zone_size, mft_lcn;
858#endif /* NTFS_RW */
859
860 ntfs_debug("vol->mft_zone_multiplier = 0x%x",
861 vol->mft_zone_multiplier);
862#ifdef NTFS_RW
863 /* Determine the size of the MFT zone. */
864 mft_zone_size = vol->nr_clusters;
865 switch (vol->mft_zone_multiplier) { /* % of volume size in clusters */
866 case 4:
867 mft_zone_size >>= 1; /* 50% */
868 break;
869 case 3:
870 mft_zone_size = (mft_zone_size +
871 (mft_zone_size >> 1)) >> 2; /* 37.5% */
872 break;
873 case 2:
874 mft_zone_size >>= 2; /* 25% */
875 break;
876 /* case 1: */
877 default:
878 mft_zone_size >>= 3; /* 12.5% */
879 break;
880 }
881 /* Setup the mft zone. */
882 vol->mft_zone_start = vol->mft_zone_pos = vol->mft_lcn;
883 ntfs_debug("vol->mft_zone_pos = 0x%llx",
884 (unsigned long long)vol->mft_zone_pos);
885 /*
886 * Calculate the mft_lcn for an unmodified NTFS volume (see mkntfs
887 * source) and if the actual mft_lcn is in the expected place or even
888 * further to the front of the volume, extend the mft_zone to cover the
889 * beginning of the volume as well. This is in order to protect the
890 * area reserved for the mft bitmap as well within the mft_zone itself.
891 * On non-standard volumes we do not protect it as the overhead would
892 * be higher than the speed increase we would get by doing it.
893 */
894 mft_lcn = (8192 + 2 * vol->cluster_size - 1) / vol->cluster_size;
895 if (mft_lcn * vol->cluster_size < 16 * 1024)
896 mft_lcn = (16 * 1024 + vol->cluster_size - 1) /
897 vol->cluster_size;
898 if (vol->mft_zone_start <= mft_lcn)
899 vol->mft_zone_start = 0;
900 ntfs_debug("vol->mft_zone_start = 0x%llx",
901 (unsigned long long)vol->mft_zone_start);
902 /*
903 * Need to cap the mft zone on non-standard volumes so that it does
904 * not point outside the boundaries of the volume. We do this by
905 * halving the zone size until we are inside the volume.
906 */
907 vol->mft_zone_end = vol->mft_lcn + mft_zone_size;
908 while (vol->mft_zone_end >= vol->nr_clusters) {
909 mft_zone_size >>= 1;
910 vol->mft_zone_end = vol->mft_lcn + mft_zone_size;
911 }
912 ntfs_debug("vol->mft_zone_end = 0x%llx",
913 (unsigned long long)vol->mft_zone_end);
914 /*
915 * Set the current position within each data zone to the start of the
916 * respective zone.
917 */
918 vol->data1_zone_pos = vol->mft_zone_end;
919 ntfs_debug("vol->data1_zone_pos = 0x%llx",
920 (unsigned long long)vol->data1_zone_pos);
921 vol->data2_zone_pos = 0;
922 ntfs_debug("vol->data2_zone_pos = 0x%llx",
923 (unsigned long long)vol->data2_zone_pos);
924
925 /* Set the mft data allocation position to mft record 24. */
926 vol->mft_data_pos = 24;
927 ntfs_debug("vol->mft_data_pos = 0x%llx",
928 (unsigned long long)vol->mft_data_pos);
929#endif /* NTFS_RW */
930}
931
932#ifdef NTFS_RW
933
934/**
935 * load_and_init_mft_mirror - load and setup the mft mirror inode for a volume
936 * @vol: ntfs super block describing device whose mft mirror to load
937 *
938 * Return TRUE on success or FALSE on error.
939 */
940static BOOL load_and_init_mft_mirror(ntfs_volume *vol)
941{
942 struct inode *tmp_ino;
943 ntfs_inode *tmp_ni;
944
945 ntfs_debug("Entering.");
946 /* Get mft mirror inode. */
947 tmp_ino = ntfs_iget(vol->sb, FILE_MFTMirr);
948 if (IS_ERR(tmp_ino) || is_bad_inode(tmp_ino)) {
949 if (!IS_ERR(tmp_ino))
950 iput(tmp_ino);
951 /* Caller will display error message. */
952 return FALSE;
953 }
954 /*
955 * Re-initialize some specifics about $MFTMirr's inode as
956 * ntfs_read_inode() will have set up the default ones.
957 */
958 /* Set uid and gid to root. */
959 tmp_ino->i_uid = tmp_ino->i_gid = 0;
960 /* Regular file. No access for anyone. */
961 tmp_ino->i_mode = S_IFREG;
962 /* No VFS initiated operations allowed for $MFTMirr. */
963 tmp_ino->i_op = &ntfs_empty_inode_ops;
964 tmp_ino->i_fop = &ntfs_empty_file_ops;
965 /* Put in our special address space operations. */
966 tmp_ino->i_mapping->a_ops = &ntfs_mst_aops;
967 tmp_ni = NTFS_I(tmp_ino);
968 /* The $MFTMirr, like the $MFT is multi sector transfer protected. */
969 NInoSetMstProtected(tmp_ni);
970 /*
971 * Set up our little cheat allowing us to reuse the async read io
972 * completion handler for directories.
973 */
974 tmp_ni->itype.index.block_size = vol->mft_record_size;
975 tmp_ni->itype.index.block_size_bits = vol->mft_record_size_bits;
976 vol->mftmirr_ino = tmp_ino;
977 ntfs_debug("Done.");
978 return TRUE;
979}
980
981/**
982 * check_mft_mirror - compare contents of the mft mirror with the mft
983 * @vol: ntfs super block describing device whose mft mirror to check
984 *
985 * Return TRUE on success or FALSE on error.
986 *
987 * Note, this function also results in the mft mirror runlist being completely
988 * mapped into memory. The mft mirror write code requires this and will BUG()
989 * should it find an unmapped runlist element.
990 */
991static BOOL check_mft_mirror(ntfs_volume *vol)
992{
993 unsigned long index;
994 struct super_block *sb = vol->sb;
995 ntfs_inode *mirr_ni;
996 struct page *mft_page, *mirr_page;
997 u8 *kmft, *kmirr;
998 runlist_element *rl, rl2[2];
999 int mrecs_per_page, i;
1000
1001 ntfs_debug("Entering.");
1002 /* Compare contents of $MFT and $MFTMirr. */
1003 mrecs_per_page = PAGE_CACHE_SIZE / vol->mft_record_size;
1004 BUG_ON(!mrecs_per_page);
1005 BUG_ON(!vol->mftmirr_size);
1006 mft_page = mirr_page = NULL;
1007 kmft = kmirr = NULL;
1008 index = i = 0;
1009 do {
1010 u32 bytes;
1011
1012 /* Switch pages if necessary. */
1013 if (!(i % mrecs_per_page)) {
1014 if (index) {
1015 ntfs_unmap_page(mft_page);
1016 ntfs_unmap_page(mirr_page);
1017 }
1018 /* Get the $MFT page. */
1019 mft_page = ntfs_map_page(vol->mft_ino->i_mapping,
1020 index);
1021 if (IS_ERR(mft_page)) {
1022 ntfs_error(sb, "Failed to read $MFT.");
1023 return FALSE;
1024 }
1025 kmft = page_address(mft_page);
1026 /* Get the $MFTMirr page. */
1027 mirr_page = ntfs_map_page(vol->mftmirr_ino->i_mapping,
1028 index);
1029 if (IS_ERR(mirr_page)) {
1030 ntfs_error(sb, "Failed to read $MFTMirr.");
1031 goto mft_unmap_out;
1032 }
1033 kmirr = page_address(mirr_page);
1034 ++index;
1035 }
1036 /* Make sure the record is ok. */
1037 if (ntfs_is_baad_recordp((le32*)kmft)) {
1038 ntfs_error(sb, "Incomplete multi sector transfer "
1039 "detected in mft record %i.", i);
1040mm_unmap_out:
1041 ntfs_unmap_page(mirr_page);
1042mft_unmap_out:
1043 ntfs_unmap_page(mft_page);
1044 return FALSE;
1045 }
1046 if (ntfs_is_baad_recordp((le32*)kmirr)) {
1047 ntfs_error(sb, "Incomplete multi sector transfer "
1048 "detected in mft mirror record %i.", i);
1049 goto mm_unmap_out;
1050 }
1051 /* Get the amount of data in the current record. */
1052 bytes = le32_to_cpu(((MFT_RECORD*)kmft)->bytes_in_use);
1053 if (!bytes || bytes > vol->mft_record_size) {
1054 bytes = le32_to_cpu(((MFT_RECORD*)kmirr)->bytes_in_use);
1055 if (!bytes || bytes > vol->mft_record_size)
1056 bytes = vol->mft_record_size;
1057 }
1058 /* Compare the two records. */
1059 if (memcmp(kmft, kmirr, bytes)) {
1060 ntfs_error(sb, "$MFT and $MFTMirr (record %i) do not "
1061 "match. Run ntfsfix or chkdsk.", i);
1062 goto mm_unmap_out;
1063 }
1064 kmft += vol->mft_record_size;
1065 kmirr += vol->mft_record_size;
1066 } while (++i < vol->mftmirr_size);
1067 /* Release the last pages. */
1068 ntfs_unmap_page(mft_page);
1069 ntfs_unmap_page(mirr_page);
1070
1071 /* Construct the mft mirror runlist by hand. */
1072 rl2[0].vcn = 0;
1073 rl2[0].lcn = vol->mftmirr_lcn;
1074 rl2[0].length = (vol->mftmirr_size * vol->mft_record_size +
1075 vol->cluster_size - 1) / vol->cluster_size;
1076 rl2[1].vcn = rl2[0].length;
1077 rl2[1].lcn = LCN_ENOENT;
1078 rl2[1].length = 0;
1079 /*
1080 * Because we have just read all of the mft mirror, we know we have
1081 * mapped the full runlist for it.
1082 */
1083 mirr_ni = NTFS_I(vol->mftmirr_ino);
1084 down_read(&mirr_ni->runlist.lock);
1085 rl = mirr_ni->runlist.rl;
1086 /* Compare the two runlists. They must be identical. */
1087 i = 0;
1088 do {
1089 if (rl2[i].vcn != rl[i].vcn || rl2[i].lcn != rl[i].lcn ||
1090 rl2[i].length != rl[i].length) {
1091 ntfs_error(sb, "$MFTMirr location mismatch. "
1092 "Run chkdsk.");
1093 up_read(&mirr_ni->runlist.lock);
1094 return FALSE;
1095 }
1096 } while (rl2[i++].length);
1097 up_read(&mirr_ni->runlist.lock);
1098 ntfs_debug("Done.");
1099 return TRUE;
1100}
1101
1102/**
1103 * load_and_check_logfile - load and check the logfile inode for a volume
1104 * @vol: ntfs super block describing device whose logfile to load
1105 *
1106 * Return TRUE on success or FALSE on error.
1107 */
1108static BOOL load_and_check_logfile(ntfs_volume *vol)
1109{
1110 struct inode *tmp_ino;
1111
1112 ntfs_debug("Entering.");
1113 tmp_ino = ntfs_iget(vol->sb, FILE_LogFile);
1114 if (IS_ERR(tmp_ino) || is_bad_inode(tmp_ino)) {
1115 if (!IS_ERR(tmp_ino))
1116 iput(tmp_ino);
1117 /* Caller will display error message. */
1118 return FALSE;
1119 }
1120 if (!ntfs_check_logfile(tmp_ino)) {
1121 iput(tmp_ino);
1122 /* ntfs_check_logfile() will have displayed error output. */
1123 return FALSE;
1124 }
1125 vol->logfile_ino = tmp_ino;
1126 ntfs_debug("Done.");
1127 return TRUE;
1128}
1129
1130/**
1131 * load_and_init_quota - load and setup the quota file for a volume if present
1132 * @vol: ntfs super block describing device whose quota file to load
1133 *
1134 * Return TRUE on success or FALSE on error. If $Quota is not present, we
1135 * leave vol->quota_ino as NULL and return success.
1136 */
1137static BOOL load_and_init_quota(ntfs_volume *vol)
1138{
1139 MFT_REF mref;
1140 struct inode *tmp_ino;
1141 ntfs_name *name = NULL;
1142 static const ntfschar Quota[7] = { const_cpu_to_le16('$'),
1143 const_cpu_to_le16('Q'), const_cpu_to_le16('u'),
1144 const_cpu_to_le16('o'), const_cpu_to_le16('t'),
1145 const_cpu_to_le16('a'), 0 };
1146 static ntfschar Q[3] = { const_cpu_to_le16('$'),
1147 const_cpu_to_le16('Q'), 0 };
1148
1149 ntfs_debug("Entering.");
1150 /*
1151 * Find the inode number for the quota file by looking up the filename
1152 * $Quota in the extended system files directory $Extend.
1153 */
1154 down(&vol->extend_ino->i_sem);
1155 mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), Quota, 6,
1156 &name);
1157 up(&vol->extend_ino->i_sem);
1158 if (IS_ERR_MREF(mref)) {
1159 /*
1160 * If the file does not exist, quotas are disabled and have
1161 * never been enabled on this volume, just return success.
1162 */
1163 if (MREF_ERR(mref) == -ENOENT) {
1164 ntfs_debug("$Quota not present. Volume does not have "
1165 "quotas enabled.");
1166 /*
1167 * No need to try to set quotas out of date if they are
1168 * not enabled.
1169 */
1170 NVolSetQuotaOutOfDate(vol);
1171 return TRUE;
1172 }
1173 /* A real error occured. */
1174 ntfs_error(vol->sb, "Failed to find inode number for $Quota.");
1175 return FALSE;
1176 }
1177 /* We do not care for the type of match that was found. */
1178 if (name)
1179 kfree(name);
1180 /* Get the inode. */
1181 tmp_ino = ntfs_iget(vol->sb, MREF(mref));
1182 if (IS_ERR(tmp_ino) || is_bad_inode(tmp_ino)) {
1183 if (!IS_ERR(tmp_ino))
1184 iput(tmp_ino);
1185 ntfs_error(vol->sb, "Failed to load $Quota.");
1186 return FALSE;
1187 }
1188 vol->quota_ino = tmp_ino;
1189 /* Get the $Q index allocation attribute. */
1190 tmp_ino = ntfs_index_iget(vol->quota_ino, Q, 2);
1191 if (IS_ERR(tmp_ino)) {
1192 ntfs_error(vol->sb, "Failed to load $Quota/$Q index.");
1193 return FALSE;
1194 }
1195 vol->quota_q_ino = tmp_ino;
1196 ntfs_debug("Done.");
1197 return TRUE;
1198}
1199
1200/**
1201 * load_and_init_attrdef - load the attribute definitions table for a volume
1202 * @vol: ntfs super block describing device whose attrdef to load
1203 *
1204 * Return TRUE on success or FALSE on error.
1205 */
1206static BOOL load_and_init_attrdef(ntfs_volume *vol)
1207{
1208 struct super_block *sb = vol->sb;
1209 struct inode *ino;
1210 struct page *page;
1211 unsigned long index, max_index;
1212 unsigned int size;
1213
1214 ntfs_debug("Entering.");
1215 /* Read attrdef table and setup vol->attrdef and vol->attrdef_size. */
1216 ino = ntfs_iget(sb, FILE_AttrDef);
1217 if (IS_ERR(ino) || is_bad_inode(ino)) {
1218 if (!IS_ERR(ino))
1219 iput(ino);
1220 goto failed;
1221 }
1222 /* The size of FILE_AttrDef must be above 0 and fit inside 31 bits. */
1223 if (!ino->i_size || ino->i_size > 0x7fffffff)
1224 goto iput_failed;
1225 vol->attrdef = (ATTR_DEF*)ntfs_malloc_nofs(ino->i_size);
1226 if (!vol->attrdef)
1227 goto iput_failed;
1228 index = 0;
1229 max_index = ino->i_size >> PAGE_CACHE_SHIFT;
1230 size = PAGE_CACHE_SIZE;
1231 while (index < max_index) {
1232 /* Read the attrdef table and copy it into the linear buffer. */
1233read_partial_attrdef_page:
1234 page = ntfs_map_page(ino->i_mapping, index);
1235 if (IS_ERR(page))
1236 goto free_iput_failed;
1237 memcpy((u8*)vol->attrdef + (index++ << PAGE_CACHE_SHIFT),
1238 page_address(page), size);
1239 ntfs_unmap_page(page);
1240 };
1241 if (size == PAGE_CACHE_SIZE) {
1242 size = ino->i_size & ~PAGE_CACHE_MASK;
1243 if (size)
1244 goto read_partial_attrdef_page;
1245 }
1246 vol->attrdef_size = ino->i_size;
1247 ntfs_debug("Read %llu bytes from $AttrDef.", ino->i_size);
1248 iput(ino);
1249 return TRUE;
1250free_iput_failed:
1251 ntfs_free(vol->attrdef);
1252 vol->attrdef = NULL;
1253iput_failed:
1254 iput(ino);
1255failed:
1256 ntfs_error(sb, "Failed to initialize attribute definition table.");
1257 return FALSE;
1258}
1259
1260#endif /* NTFS_RW */
1261
1262/**
1263 * load_and_init_upcase - load the upcase table for an ntfs volume
1264 * @vol: ntfs super block describing device whose upcase to load
1265 *
1266 * Return TRUE on success or FALSE on error.
1267 */
1268static BOOL load_and_init_upcase(ntfs_volume *vol)
1269{
1270 struct super_block *sb = vol->sb;
1271 struct inode *ino;
1272 struct page *page;
1273 unsigned long index, max_index;
1274 unsigned int size;
1275 int i, max;
1276
1277 ntfs_debug("Entering.");
1278 /* Read upcase table and setup vol->upcase and vol->upcase_len. */
1279 ino = ntfs_iget(sb, FILE_UpCase);
1280 if (IS_ERR(ino) || is_bad_inode(ino)) {
1281 if (!IS_ERR(ino))
1282 iput(ino);
1283 goto upcase_failed;
1284 }
1285 /*
1286 * The upcase size must not be above 64k Unicode characters, must not
1287 * be zero and must be a multiple of sizeof(ntfschar).
1288 */
1289 if (!ino->i_size || ino->i_size & (sizeof(ntfschar) - 1) ||
1290 ino->i_size > 64ULL * 1024 * sizeof(ntfschar))
1291 goto iput_upcase_failed;
1292 vol->upcase = (ntfschar*)ntfs_malloc_nofs(ino->i_size);
1293 if (!vol->upcase)
1294 goto iput_upcase_failed;
1295 index = 0;
1296 max_index = ino->i_size >> PAGE_CACHE_SHIFT;
1297 size = PAGE_CACHE_SIZE;
1298 while (index < max_index) {
1299 /* Read the upcase table and copy it into the linear buffer. */
1300read_partial_upcase_page:
1301 page = ntfs_map_page(ino->i_mapping, index);
1302 if (IS_ERR(page))
1303 goto iput_upcase_failed;
1304 memcpy((char*)vol->upcase + (index++ << PAGE_CACHE_SHIFT),
1305 page_address(page), size);
1306 ntfs_unmap_page(page);
1307 };
1308 if (size == PAGE_CACHE_SIZE) {
1309 size = ino->i_size & ~PAGE_CACHE_MASK;
1310 if (size)
1311 goto read_partial_upcase_page;
1312 }
1313 vol->upcase_len = ino->i_size >> UCHAR_T_SIZE_BITS;
1314 ntfs_debug("Read %llu bytes from $UpCase (expected %zu bytes).",
1315 ino->i_size, 64 * 1024 * sizeof(ntfschar));
1316 iput(ino);
1317 down(&ntfs_lock);
1318 if (!default_upcase) {
1319 ntfs_debug("Using volume specified $UpCase since default is "
1320 "not present.");
1321 up(&ntfs_lock);
1322 return TRUE;
1323 }
1324 max = default_upcase_len;
1325 if (max > vol->upcase_len)
1326 max = vol->upcase_len;
1327 for (i = 0; i < max; i++)
1328 if (vol->upcase[i] != default_upcase[i])
1329 break;
1330 if (i == max) {
1331 ntfs_free(vol->upcase);
1332 vol->upcase = default_upcase;
1333 vol->upcase_len = max;
1334 ntfs_nr_upcase_users++;
1335 up(&ntfs_lock);
1336 ntfs_debug("Volume specified $UpCase matches default. Using "
1337 "default.");
1338 return TRUE;
1339 }
1340 up(&ntfs_lock);
1341 ntfs_debug("Using volume specified $UpCase since it does not match "
1342 "the default.");
1343 return TRUE;
1344iput_upcase_failed:
1345 iput(ino);
1346 ntfs_free(vol->upcase);
1347 vol->upcase = NULL;
1348upcase_failed:
1349 down(&ntfs_lock);
1350 if (default_upcase) {
1351 vol->upcase = default_upcase;
1352 vol->upcase_len = default_upcase_len;
1353 ntfs_nr_upcase_users++;
1354 up(&ntfs_lock);
1355 ntfs_error(sb, "Failed to load $UpCase from the volume. Using "
1356 "default.");
1357 return TRUE;
1358 }
1359 up(&ntfs_lock);
1360 ntfs_error(sb, "Failed to initialize upcase table.");
1361 return FALSE;
1362}
1363
1364/**
1365 * load_system_files - open the system files using normal functions
1366 * @vol: ntfs super block describing device whose system files to load
1367 *
1368 * Open the system files with normal access functions and complete setting up
1369 * the ntfs super block @vol.
1370 *
1371 * Return TRUE on success or FALSE on error.
1372 */
1373static BOOL load_system_files(ntfs_volume *vol)
1374{
1375 struct super_block *sb = vol->sb;
1376 MFT_RECORD *m;
1377 VOLUME_INFORMATION *vi;
1378 ntfs_attr_search_ctx *ctx;
1379
1380 ntfs_debug("Entering.");
1381#ifdef NTFS_RW
1382 /* Get mft mirror inode compare the contents of $MFT and $MFTMirr. */
1383 if (!load_and_init_mft_mirror(vol) || !check_mft_mirror(vol)) {
1384 static const char *es1 = "Failed to load $MFTMirr";
1385 static const char *es2 = "$MFTMirr does not match $MFT";
1386 static const char *es3 = ". Run ntfsfix and/or chkdsk.";
1387
1388 /* If a read-write mount, convert it to a read-only mount. */
1389 if (!(sb->s_flags & MS_RDONLY)) {
1390 if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
1391 ON_ERRORS_CONTINUE))) {
1392 ntfs_error(sb, "%s and neither on_errors="
1393 "continue nor on_errors="
1394 "remount-ro was specified%s",
1395 !vol->mftmirr_ino ? es1 : es2,
1396 es3);
1397 goto iput_mirr_err_out;
1398 }
1399 sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME;
1400 ntfs_error(sb, "%s. Mounting read-only%s",
1401 !vol->mftmirr_ino ? es1 : es2, es3);
1402 } else
1403 ntfs_warning(sb, "%s. Will not be able to remount "
1404 "read-write%s",
1405 !vol->mftmirr_ino ? es1 : es2, es3);
1406 /* This will prevent a read-write remount. */
1407 NVolSetErrors(vol);
1408 }
1409#endif /* NTFS_RW */
1410 /* Get mft bitmap attribute inode. */
1411 vol->mftbmp_ino = ntfs_attr_iget(vol->mft_ino, AT_BITMAP, NULL, 0);
1412 if (IS_ERR(vol->mftbmp_ino)) {
1413 ntfs_error(sb, "Failed to load $MFT/$BITMAP attribute.");
1414 goto iput_mirr_err_out;
1415 }
1416 /* Read upcase table and setup @vol->upcase and @vol->upcase_len. */
1417 if (!load_and_init_upcase(vol))
1418 goto iput_mftbmp_err_out;
1419#ifdef NTFS_RW
1420 /*
1421 * Read attribute definitions table and setup @vol->attrdef and
1422 * @vol->attrdef_size.
1423 */
1424 if (!load_and_init_attrdef(vol))
1425 goto iput_upcase_err_out;
1426#endif /* NTFS_RW */
1427 /*
1428 * Get the cluster allocation bitmap inode and verify the size, no
1429 * need for any locking at this stage as we are already running
1430 * exclusively as we are mount in progress task.
1431 */
1432 vol->lcnbmp_ino = ntfs_iget(sb, FILE_Bitmap);
1433 if (IS_ERR(vol->lcnbmp_ino) || is_bad_inode(vol->lcnbmp_ino)) {
1434 if (!IS_ERR(vol->lcnbmp_ino))
1435 iput(vol->lcnbmp_ino);
1436 goto bitmap_failed;
1437 }
1438 if ((vol->nr_clusters + 7) >> 3 > vol->lcnbmp_ino->i_size) {
1439 iput(vol->lcnbmp_ino);
1440bitmap_failed:
1441 ntfs_error(sb, "Failed to load $Bitmap.");
1442 goto iput_attrdef_err_out;
1443 }
1444 /*
1445 * Get the volume inode and setup our cache of the volume flags and
1446 * version.
1447 */
1448 vol->vol_ino = ntfs_iget(sb, FILE_Volume);
1449 if (IS_ERR(vol->vol_ino) || is_bad_inode(vol->vol_ino)) {
1450 if (!IS_ERR(vol->vol_ino))
1451 iput(vol->vol_ino);
1452volume_failed:
1453 ntfs_error(sb, "Failed to load $Volume.");
1454 goto iput_lcnbmp_err_out;
1455 }
1456 m = map_mft_record(NTFS_I(vol->vol_ino));
1457 if (IS_ERR(m)) {
1458iput_volume_failed:
1459 iput(vol->vol_ino);
1460 goto volume_failed;
1461 }
1462 if (!(ctx = ntfs_attr_get_search_ctx(NTFS_I(vol->vol_ino), m))) {
1463 ntfs_error(sb, "Failed to get attribute search context.");
1464 goto get_ctx_vol_failed;
1465 }
1466 if (ntfs_attr_lookup(AT_VOLUME_INFORMATION, NULL, 0, 0, 0, NULL, 0,
1467 ctx) || ctx->attr->non_resident || ctx->attr->flags) {
1468err_put_vol:
1469 ntfs_attr_put_search_ctx(ctx);
1470get_ctx_vol_failed:
1471 unmap_mft_record(NTFS_I(vol->vol_ino));
1472 goto iput_volume_failed;
1473 }
1474 vi = (VOLUME_INFORMATION*)((char*)ctx->attr +
1475 le16_to_cpu(ctx->attr->data.resident.value_offset));
1476 /* Some bounds checks. */
1477 if ((u8*)vi < (u8*)ctx->attr || (u8*)vi +
1478 le32_to_cpu(ctx->attr->data.resident.value_length) >
1479 (u8*)ctx->attr + le32_to_cpu(ctx->attr->length))
1480 goto err_put_vol;
1481 /* Copy the volume flags and version to the ntfs_volume structure. */
1482 vol->vol_flags = vi->flags;
1483 vol->major_ver = vi->major_ver;
1484 vol->minor_ver = vi->minor_ver;
1485 ntfs_attr_put_search_ctx(ctx);
1486 unmap_mft_record(NTFS_I(vol->vol_ino));
1487 printk(KERN_INFO "NTFS volume version %i.%i.\n", vol->major_ver,
1488 vol->minor_ver);
1489#ifdef NTFS_RW
1490 /* Make sure that no unsupported volume flags are set. */
1491 if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) {
1492 static const char *es1a = "Volume is dirty";
1493 static const char *es1b = "Volume has unsupported flags set";
1494 static const char *es2 = ". Run chkdsk and mount in Windows.";
1495 const char *es1;
1496
1497 es1 = vol->vol_flags & VOLUME_IS_DIRTY ? es1a : es1b;
1498 /* If a read-write mount, convert it to a read-only mount. */
1499 if (!(sb->s_flags & MS_RDONLY)) {
1500 if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
1501 ON_ERRORS_CONTINUE))) {
1502 ntfs_error(sb, "%s and neither on_errors="
1503 "continue nor on_errors="
1504 "remount-ro was specified%s",
1505 es1, es2);
1506 goto iput_vol_err_out;
1507 }
1508 sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME;
1509 ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
1510 } else
1511 ntfs_warning(sb, "%s. Will not be able to remount "
1512 "read-write%s", es1, es2);
1513 /*
1514 * Do not set NVolErrors() because ntfs_remount() re-checks the
1515 * flags which we need to do in case any flags have changed.
1516 */
1517 }
1518 /*
1519 * Get the inode for the logfile, check it and determine if the volume
1520 * was shutdown cleanly.
1521 */
1522 if (!load_and_check_logfile(vol) ||
1523 !ntfs_is_logfile_clean(vol->logfile_ino)) {
1524 static const char *es1a = "Failed to load $LogFile";
1525 static const char *es1b = "$LogFile is not clean";
1526 static const char *es2 = ". Mount in Windows.";
1527 const char *es1;
1528
1529 es1 = !vol->logfile_ino ? es1a : es1b;
1530 /* If a read-write mount, convert it to a read-only mount. */
1531 if (!(sb->s_flags & MS_RDONLY)) {
1532 if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
1533 ON_ERRORS_CONTINUE))) {
1534 ntfs_error(sb, "%s and neither on_errors="
1535 "continue nor on_errors="
1536 "remount-ro was specified%s",
1537 es1, es2);
1538 goto iput_logfile_err_out;
1539 }
1540 sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME;
1541 ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
1542 } else
1543 ntfs_warning(sb, "%s. Will not be able to remount "
1544 "read-write%s", es1, es2);
1545 /* This will prevent a read-write remount. */
1546 NVolSetErrors(vol);
1547 }
1548 /* If (still) a read-write mount, mark the volume dirty. */
1549 if (!(sb->s_flags & MS_RDONLY) &&
1550 ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) {
1551 static const char *es1 = "Failed to set dirty bit in volume "
1552 "information flags";
1553 static const char *es2 = ". Run chkdsk.";
1554
1555 /* Convert to a read-only mount. */
1556 if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
1557 ON_ERRORS_CONTINUE))) {
1558 ntfs_error(sb, "%s and neither on_errors=continue nor "
1559 "on_errors=remount-ro was specified%s",
1560 es1, es2);
1561 goto iput_logfile_err_out;
1562 }
1563 ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
1564 sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME;
1565 /*
1566 * Do not set NVolErrors() because ntfs_remount() might manage
1567 * to set the dirty flag in which case all would be well.
1568 */
1569 }
1570#if 0
1571 // TODO: Enable this code once we start modifying anything that is
1572 // different between NTFS 1.2 and 3.x...
1573 /*
1574 * If (still) a read-write mount, set the NT4 compatibility flag on
1575 * newer NTFS version volumes.
1576 */
1577 if (!(sb->s_flags & MS_RDONLY) && (vol->major_ver > 1) &&
1578 ntfs_set_volume_flags(vol, VOLUME_MOUNTED_ON_NT4)) {
1579 static const char *es1 = "Failed to set NT4 compatibility flag";
1580 static const char *es2 = ". Run chkdsk.";
1581
1582 /* Convert to a read-only mount. */
1583 if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
1584 ON_ERRORS_CONTINUE))) {
1585 ntfs_error(sb, "%s and neither on_errors=continue nor "
1586 "on_errors=remount-ro was specified%s",
1587 es1, es2);
1588 goto iput_logfile_err_out;
1589 }
1590 ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
1591 sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME;
1592 NVolSetErrors(vol);
1593 }
1594#endif
1595 /* If (still) a read-write mount, empty the logfile. */
1596 if (!(sb->s_flags & MS_RDONLY) &&
1597 !ntfs_empty_logfile(vol->logfile_ino)) {
1598 static const char *es1 = "Failed to empty $LogFile";
1599 static const char *es2 = ". Mount in Windows.";
1600
1601 /* Convert to a read-only mount. */
1602 if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
1603 ON_ERRORS_CONTINUE))) {
1604 ntfs_error(sb, "%s and neither on_errors=continue nor "
1605 "on_errors=remount-ro was specified%s",
1606 es1, es2);
1607 goto iput_logfile_err_out;
1608 }
1609 ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
1610 sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME;
1611 NVolSetErrors(vol);
1612 }
1613#endif /* NTFS_RW */
1614 /* Get the root directory inode. */
1615 vol->root_ino = ntfs_iget(sb, FILE_root);
1616 if (IS_ERR(vol->root_ino) || is_bad_inode(vol->root_ino)) {
1617 if (!IS_ERR(vol->root_ino))
1618 iput(vol->root_ino);
1619 ntfs_error(sb, "Failed to load root directory.");
1620 goto iput_logfile_err_out;
1621 }
1622 /* If on NTFS versions before 3.0, we are done. */
1623 if (vol->major_ver < 3)
1624 return TRUE;
1625 /* NTFS 3.0+ specific initialization. */
1626 /* Get the security descriptors inode. */
1627 vol->secure_ino = ntfs_iget(sb, FILE_Secure);
1628 if (IS_ERR(vol->secure_ino) || is_bad_inode(vol->secure_ino)) {
1629 if (!IS_ERR(vol->secure_ino))
1630 iput(vol->secure_ino);
1631 ntfs_error(sb, "Failed to load $Secure.");
1632 goto iput_root_err_out;
1633 }
1634 // FIXME: Initialize security.
1635 /* Get the extended system files' directory inode. */
1636 vol->extend_ino = ntfs_iget(sb, FILE_Extend);
1637 if (IS_ERR(vol->extend_ino) || is_bad_inode(vol->extend_ino)) {
1638 if (!IS_ERR(vol->extend_ino))
1639 iput(vol->extend_ino);
1640 ntfs_error(sb, "Failed to load $Extend.");
1641 goto iput_sec_err_out;
1642 }
1643#ifdef NTFS_RW
1644 /* Find the quota file, load it if present, and set it up. */
1645 if (!load_and_init_quota(vol)) {
1646 static const char *es1 = "Failed to load $Quota";
1647 static const char *es2 = ". Run chkdsk.";
1648
1649 /* If a read-write mount, convert it to a read-only mount. */
1650 if (!(sb->s_flags & MS_RDONLY)) {
1651 if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
1652 ON_ERRORS_CONTINUE))) {
1653 ntfs_error(sb, "%s and neither on_errors="
1654 "continue nor on_errors="
1655 "remount-ro was specified%s",
1656 es1, es2);
1657 goto iput_quota_err_out;
1658 }
1659 sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME;
1660 ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
1661 } else
1662 ntfs_warning(sb, "%s. Will not be able to remount "
1663 "read-write%s", es1, es2);
1664 /* This will prevent a read-write remount. */
1665 NVolSetErrors(vol);
1666 }
1667 /* If (still) a read-write mount, mark the quotas out of date. */
1668 if (!(sb->s_flags & MS_RDONLY) &&
1669 !ntfs_mark_quotas_out_of_date(vol)) {
1670 static const char *es1 = "Failed to mark quotas out of date";
1671 static const char *es2 = ". Run chkdsk.";
1672
1673 /* Convert to a read-only mount. */
1674 if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
1675 ON_ERRORS_CONTINUE))) {
1676 ntfs_error(sb, "%s and neither on_errors=continue nor "
1677 "on_errors=remount-ro was specified%s",
1678 es1, es2);
1679 goto iput_quota_err_out;
1680 }
1681 ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
1682 sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME;
1683 NVolSetErrors(vol);
1684 }
1685 // TODO: Delete or checkpoint the $UsnJrnl if it exists.
1686#endif /* NTFS_RW */
1687 return TRUE;
1688#ifdef NTFS_RW
1689iput_quota_err_out:
1690 if (vol->quota_q_ino)
1691 iput(vol->quota_q_ino);
1692 if (vol->quota_ino)
1693 iput(vol->quota_ino);
1694 iput(vol->extend_ino);
1695#endif /* NTFS_RW */
1696iput_sec_err_out:
1697 iput(vol->secure_ino);
1698iput_root_err_out:
1699 iput(vol->root_ino);
1700iput_logfile_err_out:
1701#ifdef NTFS_RW
1702 if (vol->logfile_ino)
1703 iput(vol->logfile_ino);
1704iput_vol_err_out:
1705#endif /* NTFS_RW */
1706 iput(vol->vol_ino);
1707iput_lcnbmp_err_out:
1708 iput(vol->lcnbmp_ino);
1709iput_attrdef_err_out:
1710 vol->attrdef_size = 0;
1711 if (vol->attrdef) {
1712 ntfs_free(vol->attrdef);
1713 vol->attrdef = NULL;
1714 }
1715#ifdef NTFS_RW
1716iput_upcase_err_out:
1717#endif /* NTFS_RW */
1718 vol->upcase_len = 0;
1719 down(&ntfs_lock);
1720 if (vol->upcase == default_upcase) {
1721 ntfs_nr_upcase_users--;
1722 vol->upcase = NULL;
1723 }
1724 up(&ntfs_lock);
1725 if (vol->upcase) {
1726 ntfs_free(vol->upcase);
1727 vol->upcase = NULL;
1728 }
1729iput_mftbmp_err_out:
1730 iput(vol->mftbmp_ino);
1731iput_mirr_err_out:
1732#ifdef NTFS_RW
1733 if (vol->mftmirr_ino)
1734 iput(vol->mftmirr_ino);
1735#endif /* NTFS_RW */
1736 return FALSE;
1737}
1738
1739/**
1740 * ntfs_put_super - called by the vfs to unmount a volume
1741 * @sb: vfs superblock of volume to unmount
1742 *
1743 * ntfs_put_super() is called by the VFS (from fs/super.c::do_umount()) when
1744 * the volume is being unmounted (umount system call has been invoked) and it
1745 * releases all inodes and memory belonging to the NTFS specific part of the
1746 * super block.
1747 */
1748static void ntfs_put_super(struct super_block *sb)
1749{
1750 ntfs_volume *vol = NTFS_SB(sb);
1751
1752 ntfs_debug("Entering.");
1753#ifdef NTFS_RW
1754 /*
1755 * Commit all inodes while they are still open in case some of them
1756 * cause others to be dirtied.
1757 */
1758 ntfs_commit_inode(vol->vol_ino);
1759
1760 /* NTFS 3.0+ specific. */
1761 if (vol->major_ver >= 3) {
1762 if (vol->quota_q_ino)
1763 ntfs_commit_inode(vol->quota_q_ino);
1764 if (vol->quota_ino)
1765 ntfs_commit_inode(vol->quota_ino);
1766 if (vol->extend_ino)
1767 ntfs_commit_inode(vol->extend_ino);
1768 if (vol->secure_ino)
1769 ntfs_commit_inode(vol->secure_ino);
1770 }
1771
1772 ntfs_commit_inode(vol->root_ino);
1773
1774 down_write(&vol->lcnbmp_lock);
1775 ntfs_commit_inode(vol->lcnbmp_ino);
1776 up_write(&vol->lcnbmp_lock);
1777
1778 down_write(&vol->mftbmp_lock);
1779 ntfs_commit_inode(vol->mftbmp_ino);
1780 up_write(&vol->mftbmp_lock);
1781
1782 if (vol->logfile_ino)
1783 ntfs_commit_inode(vol->logfile_ino);
1784
1785 if (vol->mftmirr_ino)
1786 ntfs_commit_inode(vol->mftmirr_ino);
1787 ntfs_commit_inode(vol->mft_ino);
1788
1789 /*
1790 * If a read-write mount and no volume errors have occured, mark the
1791 * volume clean. Also, re-commit all affected inodes.
1792 */
1793 if (!(sb->s_flags & MS_RDONLY)) {
1794 if (!NVolErrors(vol)) {
1795 if (ntfs_clear_volume_flags(vol, VOLUME_IS_DIRTY))
1796 ntfs_warning(sb, "Failed to clear dirty bit "
1797 "in volume information "
1798 "flags. Run chkdsk.");
1799 ntfs_commit_inode(vol->vol_ino);
1800 ntfs_commit_inode(vol->root_ino);
1801 if (vol->mftmirr_ino)
1802 ntfs_commit_inode(vol->mftmirr_ino);
1803 ntfs_commit_inode(vol->mft_ino);
1804 } else {
1805 ntfs_warning(sb, "Volume has errors. Leaving volume "
1806 "marked dirty. Run chkdsk.");
1807 }
1808 }
1809#endif /* NTFS_RW */
1810
1811 iput(vol->vol_ino);
1812 vol->vol_ino = NULL;
1813
1814 /* NTFS 3.0+ specific clean up. */
1815 if (vol->major_ver >= 3) {
1816#ifdef NTFS_RW
1817 if (vol->quota_q_ino) {
1818 iput(vol->quota_q_ino);
1819 vol->quota_q_ino = NULL;
1820 }
1821 if (vol->quota_ino) {
1822 iput(vol->quota_ino);
1823 vol->quota_ino = NULL;
1824 }
1825#endif /* NTFS_RW */
1826 if (vol->extend_ino) {
1827 iput(vol->extend_ino);
1828 vol->extend_ino = NULL;
1829 }
1830 if (vol->secure_ino) {
1831 iput(vol->secure_ino);
1832 vol->secure_ino = NULL;
1833 }
1834 }
1835
1836 iput(vol->root_ino);
1837 vol->root_ino = NULL;
1838
1839 down_write(&vol->lcnbmp_lock);
1840 iput(vol->lcnbmp_ino);
1841 vol->lcnbmp_ino = NULL;
1842 up_write(&vol->lcnbmp_lock);
1843
1844 down_write(&vol->mftbmp_lock);
1845 iput(vol->mftbmp_ino);
1846 vol->mftbmp_ino = NULL;
1847 up_write(&vol->mftbmp_lock);
1848
1849#ifdef NTFS_RW
1850 if (vol->logfile_ino) {
1851 iput(vol->logfile_ino);
1852 vol->logfile_ino = NULL;
1853 }
1854 if (vol->mftmirr_ino) {
1855 /* Re-commit the mft mirror and mft just in case. */
1856 ntfs_commit_inode(vol->mftmirr_ino);
1857 ntfs_commit_inode(vol->mft_ino);
1858 iput(vol->mftmirr_ino);
1859 vol->mftmirr_ino = NULL;
1860 }
1861 /*
1862 * If any dirty inodes are left, throw away all mft data page cache
1863 * pages to allow a clean umount. This should never happen any more
1864 * due to mft.c::ntfs_mft_writepage() cleaning all the dirty pages as
1865 * the underlying mft records are written out and cleaned. If it does,
1866 * happen anyway, we want to know...
1867 */
1868 ntfs_commit_inode(vol->mft_ino);
1869 write_inode_now(vol->mft_ino, 1);
1870 if (!list_empty(&sb->s_dirty)) {
1871 const char *s1, *s2;
1872
1873 down(&vol->mft_ino->i_sem);
1874 truncate_inode_pages(vol->mft_ino->i_mapping, 0);
1875 up(&vol->mft_ino->i_sem);
1876 write_inode_now(vol->mft_ino, 1);
1877 if (!list_empty(&sb->s_dirty)) {
1878 static const char *_s1 = "inodes";
1879 static const char *_s2 = "";
1880 s1 = _s1;
1881 s2 = _s2;
1882 } else {
1883 static const char *_s1 = "mft pages";
1884 static const char *_s2 = "They have been thrown "
1885 "away. ";
1886 s1 = _s1;
1887 s2 = _s2;
1888 }
1889 ntfs_error(sb, "Dirty %s found at umount time. %sYou should "
1890 "run chkdsk. Please email "
1891 "linux-ntfs-dev@lists.sourceforge.net and say "
1892 "that you saw this message. Thank you.", s1,
1893 s2);
1894 }
1895#endif /* NTFS_RW */
1896
1897 iput(vol->mft_ino);
1898 vol->mft_ino = NULL;
1899
1900 /* Throw away the table of attribute definitions. */
1901 vol->attrdef_size = 0;
1902 if (vol->attrdef) {
1903 ntfs_free(vol->attrdef);
1904 vol->attrdef = NULL;
1905 }
1906 vol->upcase_len = 0;
1907 /*
1908 * Destroy the global default upcase table if necessary. Also decrease
1909 * the number of upcase users if we are a user.
1910 */
1911 down(&ntfs_lock);
1912 if (vol->upcase == default_upcase) {
1913 ntfs_nr_upcase_users--;
1914 vol->upcase = NULL;
1915 }
1916 if (!ntfs_nr_upcase_users && default_upcase) {
1917 ntfs_free(default_upcase);
1918 default_upcase = NULL;
1919 }
1920 if (vol->cluster_size <= 4096 && !--ntfs_nr_compression_users)
1921 free_compression_buffers();
1922 up(&ntfs_lock);
1923 if (vol->upcase) {
1924 ntfs_free(vol->upcase);
1925 vol->upcase = NULL;
1926 }
1927 if (vol->nls_map) {
1928 unload_nls(vol->nls_map);
1929 vol->nls_map = NULL;
1930 }
1931 sb->s_fs_info = NULL;
1932 kfree(vol);
1933 return;
1934}
1935
1936/**
1937 * get_nr_free_clusters - return the number of free clusters on a volume
1938 * @vol: ntfs volume for which to obtain free cluster count
1939 *
1940 * Calculate the number of free clusters on the mounted NTFS volume @vol. We
1941 * actually calculate the number of clusters in use instead because this
1942 * allows us to not care about partial pages as these will be just zero filled
1943 * and hence not be counted as allocated clusters.
1944 *
1945 * The only particularity is that clusters beyond the end of the logical ntfs
1946 * volume will be marked as allocated to prevent errors which means we have to
1947 * discount those at the end. This is important as the cluster bitmap always
1948 * has a size in multiples of 8 bytes, i.e. up to 63 clusters could be outside
1949 * the logical volume and marked in use when they are not as they do not exist.
1950 *
1951 * If any pages cannot be read we assume all clusters in the erroring pages are
1952 * in use. This means we return an underestimate on errors which is better than
1953 * an overestimate.
1954 */
1955static s64 get_nr_free_clusters(ntfs_volume *vol)
1956{
1957 s64 nr_free = vol->nr_clusters;
1958 u32 *kaddr;
1959 struct address_space *mapping = vol->lcnbmp_ino->i_mapping;
1960 filler_t *readpage = (filler_t*)mapping->a_ops->readpage;
1961 struct page *page;
1962 unsigned long index, max_index;
1963 unsigned int max_size;
1964
1965 ntfs_debug("Entering.");
1966 /* Serialize accesses to the cluster bitmap. */
1967 down_read(&vol->lcnbmp_lock);
1968 /*
1969 * Convert the number of bits into bytes rounded up, then convert into
1970 * multiples of PAGE_CACHE_SIZE, rounding up so that if we have one
1971 * full and one partial page max_index = 2.
1972 */
1973 max_index = (((vol->nr_clusters + 7) >> 3) + PAGE_CACHE_SIZE - 1) >>
1974 PAGE_CACHE_SHIFT;
1975 /* Use multiples of 4 bytes. */
1976 max_size = PAGE_CACHE_SIZE >> 2;
1977 ntfs_debug("Reading $Bitmap, max_index = 0x%lx, max_size = 0x%x.",
1978 max_index, max_size);
1979 for (index = 0UL; index < max_index; index++) {
1980 unsigned int i;
1981 /*
1982 * Read the page from page cache, getting it from backing store
1983 * if necessary, and increment the use count.
1984 */
1985 page = read_cache_page(mapping, index, (filler_t*)readpage,
1986 NULL);
1987 /* Ignore pages which errored synchronously. */
1988 if (IS_ERR(page)) {
1989 ntfs_debug("Sync read_cache_page() error. Skipping "
1990 "page (index 0x%lx).", index);
1991 nr_free -= PAGE_CACHE_SIZE * 8;
1992 continue;
1993 }
1994 wait_on_page_locked(page);
1995 /* Ignore pages which errored asynchronously. */
1996 if (!PageUptodate(page)) {
1997 ntfs_debug("Async read_cache_page() error. Skipping "
1998 "page (index 0x%lx).", index);
1999 page_cache_release(page);
2000 nr_free -= PAGE_CACHE_SIZE * 8;
2001 continue;
2002 }
2003 kaddr = (u32*)kmap_atomic(page, KM_USER0);
2004 /*
2005 * For each 4 bytes, subtract the number of set bits. If this
2006 * is the last page and it is partial we don't really care as
2007 * it just means we do a little extra work but it won't affect
2008 * the result as all out of range bytes are set to zero by
2009 * ntfs_readpage().
2010 */
2011 for (i = 0; i < max_size; i++)
2012 nr_free -= (s64)hweight32(kaddr[i]);
2013 kunmap_atomic(kaddr, KM_USER0);
2014 page_cache_release(page);
2015 }
2016 ntfs_debug("Finished reading $Bitmap, last index = 0x%lx.", index - 1);
2017 /*
2018 * Fixup for eventual bits outside logical ntfs volume (see function
2019 * description above).
2020 */
2021 if (vol->nr_clusters & 63)
2022 nr_free += 64 - (vol->nr_clusters & 63);
2023 up_read(&vol->lcnbmp_lock);
2024 /* If errors occured we may well have gone below zero, fix this. */
2025 if (nr_free < 0)
2026 nr_free = 0;
2027 ntfs_debug("Exiting.");
2028 return nr_free;
2029}
2030
2031/**
2032 * __get_nr_free_mft_records - return the number of free inodes on a volume
2033 * @vol: ntfs volume for which to obtain free inode count
2034 *
2035 * Calculate the number of free mft records (inodes) on the mounted NTFS
2036 * volume @vol. We actually calculate the number of mft records in use instead
2037 * because this allows us to not care about partial pages as these will be just
2038 * zero filled and hence not be counted as allocated mft record.
2039 *
2040 * If any pages cannot be read we assume all mft records in the erroring pages
2041 * are in use. This means we return an underestimate on errors which is better
2042 * than an overestimate.
2043 *
2044 * NOTE: Caller must hold mftbmp_lock rw_semaphore for reading or writing.
2045 */
2046static unsigned long __get_nr_free_mft_records(ntfs_volume *vol)
2047{
2048 s64 nr_free;
2049 u32 *kaddr;
2050 struct address_space *mapping = vol->mftbmp_ino->i_mapping;
2051 filler_t *readpage = (filler_t*)mapping->a_ops->readpage;
2052 struct page *page;
2053 unsigned long index, max_index;
2054 unsigned int max_size;
2055
2056 ntfs_debug("Entering.");
2057 /* Number of mft records in file system (at this point in time). */
2058 nr_free = vol->mft_ino->i_size >> vol->mft_record_size_bits;
2059 /*
2060 * Convert the maximum number of set bits into bytes rounded up, then
2061 * convert into multiples of PAGE_CACHE_SIZE, rounding up so that if we
2062 * have one full and one partial page max_index = 2.
2063 */
2064 max_index = ((((NTFS_I(vol->mft_ino)->initialized_size >>
2065 vol->mft_record_size_bits) + 7) >> 3) +
2066 PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
2067 /* Use multiples of 4 bytes. */
2068 max_size = PAGE_CACHE_SIZE >> 2;
2069 ntfs_debug("Reading $MFT/$BITMAP, max_index = 0x%lx, max_size = "
2070 "0x%x.", max_index, max_size);
2071 for (index = 0UL; index < max_index; index++) {
2072 unsigned int i;
2073 /*
2074 * Read the page from page cache, getting it from backing store
2075 * if necessary, and increment the use count.
2076 */
2077 page = read_cache_page(mapping, index, (filler_t*)readpage,
2078 NULL);
2079 /* Ignore pages which errored synchronously. */
2080 if (IS_ERR(page)) {
2081 ntfs_debug("Sync read_cache_page() error. Skipping "
2082 "page (index 0x%lx).", index);
2083 nr_free -= PAGE_CACHE_SIZE * 8;
2084 continue;
2085 }
2086 wait_on_page_locked(page);
2087 /* Ignore pages which errored asynchronously. */
2088 if (!PageUptodate(page)) {
2089 ntfs_debug("Async read_cache_page() error. Skipping "
2090 "page (index 0x%lx).", index);
2091 page_cache_release(page);
2092 nr_free -= PAGE_CACHE_SIZE * 8;
2093 continue;
2094 }
2095 kaddr = (u32*)kmap_atomic(page, KM_USER0);
2096 /*
2097 * For each 4 bytes, subtract the number of set bits. If this
2098 * is the last page and it is partial we don't really care as
2099 * it just means we do a little extra work but it won't affect
2100 * the result as all out of range bytes are set to zero by
2101 * ntfs_readpage().
2102 */
2103 for (i = 0; i < max_size; i++)
2104 nr_free -= (s64)hweight32(kaddr[i]);
2105 kunmap_atomic(kaddr, KM_USER0);
2106 page_cache_release(page);
2107 }
2108 ntfs_debug("Finished reading $MFT/$BITMAP, last index = 0x%lx.",
2109 index - 1);
2110 /* If errors occured we may well have gone below zero, fix this. */
2111 if (nr_free < 0)
2112 nr_free = 0;
2113 ntfs_debug("Exiting.");
2114 return nr_free;
2115}
2116
2117/**
2118 * ntfs_statfs - return information about mounted NTFS volume
2119 * @sb: super block of mounted volume
2120 * @sfs: statfs structure in which to return the information
2121 *
2122 * Return information about the mounted NTFS volume @sb in the statfs structure
2123 * pointed to by @sfs (this is initialized with zeros before ntfs_statfs is
2124 * called). We interpret the values to be correct of the moment in time at
2125 * which we are called. Most values are variable otherwise and this isn't just
2126 * the free values but the totals as well. For example we can increase the
2127 * total number of file nodes if we run out and we can keep doing this until
2128 * there is no more space on the volume left at all.
2129 *
2130 * Called from vfs_statfs which is used to handle the statfs, fstatfs, and
2131 * ustat system calls.
2132 *
2133 * Return 0 on success or -errno on error.
2134 */
2135static int ntfs_statfs(struct super_block *sb, struct kstatfs *sfs)
2136{
2137 ntfs_volume *vol = NTFS_SB(sb);
2138 s64 size;
2139
2140 ntfs_debug("Entering.");
2141 /* Type of filesystem. */
2142 sfs->f_type = NTFS_SB_MAGIC;
2143 /* Optimal transfer block size. */
2144 sfs->f_bsize = PAGE_CACHE_SIZE;
2145 /*
2146 * Total data blocks in file system in units of f_bsize and since
2147 * inodes are also stored in data blocs ($MFT is a file) this is just
2148 * the total clusters.
2149 */
2150 sfs->f_blocks = vol->nr_clusters << vol->cluster_size_bits >>
2151 PAGE_CACHE_SHIFT;
2152 /* Free data blocks in file system in units of f_bsize. */
2153 size = get_nr_free_clusters(vol) << vol->cluster_size_bits >>
2154 PAGE_CACHE_SHIFT;
2155 if (size < 0LL)
2156 size = 0LL;
2157 /* Free blocks avail to non-superuser, same as above on NTFS. */
2158 sfs->f_bavail = sfs->f_bfree = size;
2159 /* Serialize accesses to the inode bitmap. */
2160 down_read(&vol->mftbmp_lock);
2161 /* Number of inodes in file system (at this point in time). */
2162 sfs->f_files = vol->mft_ino->i_size >> vol->mft_record_size_bits;
2163 /* Free inodes in fs (based on current total count). */
2164 sfs->f_ffree = __get_nr_free_mft_records(vol);
2165 up_read(&vol->mftbmp_lock);
2166 /*
2167 * File system id. This is extremely *nix flavour dependent and even
2168 * within Linux itself all fs do their own thing. I interpret this to
2169 * mean a unique id associated with the mounted fs and not the id
2170 * associated with the file system driver, the latter is already given
2171 * by the file system type in sfs->f_type. Thus we use the 64-bit
2172 * volume serial number splitting it into two 32-bit parts. We enter
2173 * the least significant 32-bits in f_fsid[0] and the most significant
2174 * 32-bits in f_fsid[1].
2175 */
2176 sfs->f_fsid.val[0] = vol->serial_no & 0xffffffff;
2177 sfs->f_fsid.val[1] = (vol->serial_no >> 32) & 0xffffffff;
2178 /* Maximum length of filenames. */
2179 sfs->f_namelen = NTFS_MAX_NAME_LEN;
2180 return 0;
2181}
2182
2183/**
2184 * The complete super operations.
2185 */
2186static struct super_operations ntfs_sops = {
2187 .alloc_inode = ntfs_alloc_big_inode, /* VFS: Allocate new inode. */
2188 .destroy_inode = ntfs_destroy_big_inode, /* VFS: Deallocate inode. */
2189 .put_inode = ntfs_put_inode, /* VFS: Called just before
2190 the inode reference count
2191 is decreased. */
2192#ifdef NTFS_RW
2193 //.dirty_inode = NULL, /* VFS: Called from
2194 // __mark_inode_dirty(). */
2195 .write_inode = ntfs_write_inode, /* VFS: Write dirty inode to
2196 disk. */
2197 //.drop_inode = NULL, /* VFS: Called just after the
2198 // inode reference count has
2199 // been decreased to zero.
2200 // NOTE: The inode lock is
2201 // held. See fs/inode.c::
2202 // generic_drop_inode(). */
2203 //.delete_inode = NULL, /* VFS: Delete inode from disk.
2204 // Called when i_count becomes
2205 // 0 and i_nlink is also 0. */
2206 //.write_super = NULL, /* Flush dirty super block to
2207 // disk. */
2208 //.sync_fs = NULL, /* ? */
2209 //.write_super_lockfs = NULL, /* ? */
2210 //.unlockfs = NULL, /* ? */
2211#endif /* NTFS_RW */
2212 .put_super = ntfs_put_super, /* Syscall: umount. */
2213 .statfs = ntfs_statfs, /* Syscall: statfs */
2214 .remount_fs = ntfs_remount, /* Syscall: mount -o remount. */
2215 .clear_inode = ntfs_clear_big_inode, /* VFS: Called when an inode is
2216 removed from memory. */
2217 //.umount_begin = NULL, /* Forced umount. */
2218 .show_options = ntfs_show_options, /* Show mount options in
2219 proc. */
2220};
2221
2222
2223/**
2224 * Declarations for NTFS specific export operations (fs/ntfs/namei.c).
2225 */
2226extern struct dentry *ntfs_get_parent(struct dentry *child_dent);
2227extern struct dentry *ntfs_get_dentry(struct super_block *sb, void *fh);
2228
2229/**
2230 * Export operations allowing NFS exporting of mounted NTFS partitions.
2231 *
2232 * We use the default ->decode_fh() and ->encode_fh() for now. Note that they
2233 * use 32 bits to store the inode number which is an unsigned long so on 64-bit
2234 * architectures is usually 64 bits so it would all fail horribly on huge
2235 * volumes. I guess we need to define our own encode and decode fh functions
2236 * that store 64-bit inode numbers at some point but for now we will ignore the
2237 * problem...
2238 *
2239 * We also use the default ->get_name() helper (used by ->decode_fh() via
2240 * fs/exportfs/expfs.c::find_exported_dentry()) as that is completely fs
2241 * independent.
2242 *
2243 * The default ->get_parent() just returns -EACCES so we have to provide our
2244 * own and the default ->get_dentry() is incompatible with NTFS due to not
2245 * allowing the inode number 0 which is used in NTFS for the system file $MFT
2246 * and due to using iget() whereas NTFS needs ntfs_iget().
2247 */
2248static struct export_operations ntfs_export_ops = {
2249 .get_parent = ntfs_get_parent, /* Find the parent of a given
2250 directory. */
2251 .get_dentry = ntfs_get_dentry, /* Find a dentry for the inode
2252 given a file handle
2253 sub-fragment. */
2254};
2255
2256/**
2257 * ntfs_fill_super - mount an ntfs files system
2258 * @sb: super block of ntfs file system to mount
2259 * @opt: string containing the mount options
2260 * @silent: silence error output
2261 *
2262 * ntfs_fill_super() is called by the VFS to mount the device described by @sb
2263 * with the mount otions in @data with the NTFS file system.
2264 *
2265 * If @silent is true, remain silent even if errors are detected. This is used
2266 * during bootup, when the kernel tries to mount the root file system with all
2267 * registered file systems one after the other until one succeeds. This implies
2268 * that all file systems except the correct one will quite correctly and
2269 * expectedly return an error, but nobody wants to see error messages when in
2270 * fact this is what is supposed to happen.
2271 *
2272 * NOTE: @sb->s_flags contains the mount options flags.
2273 */
2274static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
2275{
2276 ntfs_volume *vol;
2277 struct buffer_head *bh;
2278 struct inode *tmp_ino;
2279 int result;
2280
2281 ntfs_debug("Entering.");
2282#ifndef NTFS_RW
2283 sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME;
2284#endif /* ! NTFS_RW */
2285 /* Allocate a new ntfs_volume and place it in sb->s_fs_info. */
2286 sb->s_fs_info = kmalloc(sizeof(ntfs_volume), GFP_NOFS);
2287 vol = NTFS_SB(sb);
2288 if (!vol) {
2289 if (!silent)
2290 ntfs_error(sb, "Allocation of NTFS volume structure "
2291 "failed. Aborting mount...");
2292 return -ENOMEM;
2293 }
2294 /* Initialize ntfs_volume structure. */
2295 memset(vol, 0, sizeof(ntfs_volume));
2296 vol->sb = sb;
2297 vol->upcase = NULL;
2298 vol->attrdef = NULL;
2299 vol->mft_ino = NULL;
2300 vol->mftbmp_ino = NULL;
2301 init_rwsem(&vol->mftbmp_lock);
2302#ifdef NTFS_RW
2303 vol->mftmirr_ino = NULL;
2304 vol->logfile_ino = NULL;
2305#endif /* NTFS_RW */
2306 vol->lcnbmp_ino = NULL;
2307 init_rwsem(&vol->lcnbmp_lock);
2308 vol->vol_ino = NULL;
2309 vol->root_ino = NULL;
2310 vol->secure_ino = NULL;
2311 vol->extend_ino = NULL;
2312#ifdef NTFS_RW
2313 vol->quota_ino = NULL;
2314 vol->quota_q_ino = NULL;
2315#endif /* NTFS_RW */
2316 vol->nls_map = NULL;
2317
2318 /*
2319 * Default is group and other don't have any access to files or
2320 * directories while owner has full access. Further, files by default
2321 * are not executable but directories are of course browseable.
2322 */
2323 vol->fmask = 0177;
2324 vol->dmask = 0077;
2325
2326 unlock_kernel();
2327
2328 /* Important to get the mount options dealt with now. */
2329 if (!parse_options(vol, (char*)opt))
2330 goto err_out_now;
2331
2332 /*
2333 * TODO: Fail safety check. In the future we should really be able to
2334 * cope with this being the case, but for now just bail out.
2335 */
2336 if (bdev_hardsect_size(sb->s_bdev) > NTFS_BLOCK_SIZE) {
2337 if (!silent)
2338 ntfs_error(sb, "Device has unsupported hardsect_size.");
2339 goto err_out_now;
2340 }
2341
2342 /* Setup the device access block size to NTFS_BLOCK_SIZE. */
2343 if (sb_set_blocksize(sb, NTFS_BLOCK_SIZE) != NTFS_BLOCK_SIZE) {
2344 if (!silent)
2345 ntfs_error(sb, "Unable to set block size.");
2346 goto err_out_now;
2347 }
2348
2349 /* Get the size of the device in units of NTFS_BLOCK_SIZE bytes. */
2350 vol->nr_blocks = sb->s_bdev->bd_inode->i_size >> NTFS_BLOCK_SIZE_BITS;
2351
2352 /* Read the boot sector and return unlocked buffer head to it. */
2353 if (!(bh = read_ntfs_boot_sector(sb, silent))) {
2354 if (!silent)
2355 ntfs_error(sb, "Not an NTFS volume.");
2356 goto err_out_now;
2357 }
2358
2359 /*
2360 * Extract the data from the boot sector and setup the ntfs super block
2361 * using it.
2362 */
2363 result = parse_ntfs_boot_sector(vol, (NTFS_BOOT_SECTOR*)bh->b_data);
2364
2365 /* Initialize the cluster and mft allocators. */
2366 ntfs_setup_allocators(vol);
2367
2368 brelse(bh);
2369
2370 if (!result) {
2371 if (!silent)
2372 ntfs_error(sb, "Unsupported NTFS filesystem.");
2373 goto err_out_now;
2374 }
2375
2376 /*
2377 * TODO: When we start coping with sector sizes different from
2378 * NTFS_BLOCK_SIZE, we now probably need to set the blocksize of the
2379 * device (probably to NTFS_BLOCK_SIZE).
2380 */
2381
2382 /* Setup remaining fields in the super block. */
2383 sb->s_magic = NTFS_SB_MAGIC;
2384
2385 /*
2386 * Ntfs allows 63 bits for the file size, i.e. correct would be:
2387 * sb->s_maxbytes = ~0ULL >> 1;
2388 * But the kernel uses a long as the page cache page index which on
2389 * 32-bit architectures is only 32-bits. MAX_LFS_FILESIZE is kernel
2390 * defined to the maximum the page cache page index can cope with
2391 * without overflowing the index or to 2^63 - 1, whichever is smaller.
2392 */
2393 sb->s_maxbytes = MAX_LFS_FILESIZE;
2394
2395 sb->s_time_gran = 100;
2396
2397 /*
2398 * Now load the metadata required for the page cache and our address
2399 * space operations to function. We do this by setting up a specialised
2400 * read_inode method and then just calling the normal iget() to obtain
2401 * the inode for $MFT which is sufficient to allow our normal inode
2402 * operations and associated address space operations to function.
2403 */
2404 sb->s_op = &ntfs_sops;
2405 tmp_ino = new_inode(sb);
2406 if (!tmp_ino) {
2407 if (!silent)
2408 ntfs_error(sb, "Failed to load essential metadata.");
2409 goto err_out_now;
2410 }
2411 tmp_ino->i_ino = FILE_MFT;
2412 insert_inode_hash(tmp_ino);
2413 if (ntfs_read_inode_mount(tmp_ino) < 0) {
2414 if (!silent)
2415 ntfs_error(sb, "Failed to load essential metadata.");
2416 goto iput_tmp_ino_err_out_now;
2417 }
2418 down(&ntfs_lock);
2419 /*
2420 * The current mount is a compression user if the cluster size is
2421 * less than or equal 4kiB.
2422 */
2423 if (vol->cluster_size <= 4096 && !ntfs_nr_compression_users++) {
2424 result = allocate_compression_buffers();
2425 if (result) {
2426 ntfs_error(NULL, "Failed to allocate buffers "
2427 "for compression engine.");
2428 ntfs_nr_compression_users--;
2429 up(&ntfs_lock);
2430 goto iput_tmp_ino_err_out_now;
2431 }
2432 }
2433 /*
2434 * Generate the global default upcase table if necessary. Also
2435 * temporarily increment the number of upcase users to avoid race
2436 * conditions with concurrent (u)mounts.
2437 */
2438 if (!default_upcase)
2439 default_upcase = generate_default_upcase();
2440 ntfs_nr_upcase_users++;
2441 up(&ntfs_lock);
2442 /*
2443 * From now on, ignore @silent parameter. If we fail below this line,
2444 * it will be due to a corrupt fs or a system error, so we report it.
2445 */
2446 /*
2447 * Open the system files with normal access functions and complete
2448 * setting up the ntfs super block.
2449 */
2450 if (!load_system_files(vol)) {
2451 ntfs_error(sb, "Failed to load system files.");
2452 goto unl_upcase_iput_tmp_ino_err_out_now;
2453 }
2454 if ((sb->s_root = d_alloc_root(vol->root_ino))) {
2455 /* We increment i_count simulating an ntfs_iget(). */
2456 atomic_inc(&vol->root_ino->i_count);
2457 ntfs_debug("Exiting, status successful.");
2458 /* Release the default upcase if it has no users. */
2459 down(&ntfs_lock);
2460 if (!--ntfs_nr_upcase_users && default_upcase) {
2461 ntfs_free(default_upcase);
2462 default_upcase = NULL;
2463 }
2464 up(&ntfs_lock);
2465 sb->s_export_op = &ntfs_export_ops;
2466 lock_kernel();
2467 return 0;
2468 }
2469 ntfs_error(sb, "Failed to allocate root directory.");
2470 /* Clean up after the successful load_system_files() call from above. */
2471 // TODO: Use ntfs_put_super() instead of repeating all this code...
2472 // FIXME: Should mark the volume clean as the error is most likely
2473 // -ENOMEM.
2474 iput(vol->vol_ino);
2475 vol->vol_ino = NULL;
2476 /* NTFS 3.0+ specific clean up. */
2477 if (vol->major_ver >= 3) {
2478#ifdef NTFS_RW
2479 if (vol->quota_q_ino) {
2480 iput(vol->quota_q_ino);
2481 vol->quota_q_ino = NULL;
2482 }
2483 if (vol->quota_ino) {
2484 iput(vol->quota_ino);
2485 vol->quota_ino = NULL;
2486 }
2487#endif /* NTFS_RW */
2488 if (vol->extend_ino) {
2489 iput(vol->extend_ino);
2490 vol->extend_ino = NULL;
2491 }
2492 if (vol->secure_ino) {
2493 iput(vol->secure_ino);
2494 vol->secure_ino = NULL;
2495 }
2496 }
2497 iput(vol->root_ino);
2498 vol->root_ino = NULL;
2499 iput(vol->lcnbmp_ino);
2500 vol->lcnbmp_ino = NULL;
2501 iput(vol->mftbmp_ino);
2502 vol->mftbmp_ino = NULL;
2503#ifdef NTFS_RW
2504 if (vol->logfile_ino) {
2505 iput(vol->logfile_ino);
2506 vol->logfile_ino = NULL;
2507 }
2508 if (vol->mftmirr_ino) {
2509 iput(vol->mftmirr_ino);
2510 vol->mftmirr_ino = NULL;
2511 }
2512#endif /* NTFS_RW */
2513 /* Throw away the table of attribute definitions. */
2514 vol->attrdef_size = 0;
2515 if (vol->attrdef) {
2516 ntfs_free(vol->attrdef);
2517 vol->attrdef = NULL;
2518 }
2519 vol->upcase_len = 0;
2520 down(&ntfs_lock);
2521 if (vol->upcase == default_upcase) {
2522 ntfs_nr_upcase_users--;
2523 vol->upcase = NULL;
2524 }
2525 up(&ntfs_lock);
2526 if (vol->upcase) {
2527 ntfs_free(vol->upcase);
2528 vol->upcase = NULL;
2529 }
2530 if (vol->nls_map) {
2531 unload_nls(vol->nls_map);
2532 vol->nls_map = NULL;
2533 }
2534 /* Error exit code path. */
2535unl_upcase_iput_tmp_ino_err_out_now:
2536 /*
2537 * Decrease the number of upcase users and destroy the global default
2538 * upcase table if necessary.
2539 */
2540 down(&ntfs_lock);
2541 if (!--ntfs_nr_upcase_users && default_upcase) {
2542 ntfs_free(default_upcase);
2543 default_upcase = NULL;
2544 }
2545 if (vol->cluster_size <= 4096 && !--ntfs_nr_compression_users)
2546 free_compression_buffers();
2547 up(&ntfs_lock);
2548iput_tmp_ino_err_out_now:
2549 iput(tmp_ino);
2550 if (vol->mft_ino && vol->mft_ino != tmp_ino)
2551 iput(vol->mft_ino);
2552 vol->mft_ino = NULL;
2553 /*
2554 * This is needed to get ntfs_clear_extent_inode() called for each
2555 * inode we have ever called ntfs_iget()/iput() on, otherwise we A)
2556 * leak resources and B) a subsequent mount fails automatically due to
2557 * ntfs_iget() never calling down into our ntfs_read_locked_inode()
2558 * method again... FIXME: Do we need to do this twice now because of
2559 * attribute inodes? I think not, so leave as is for now... (AIA)
2560 */
2561 if (invalidate_inodes(sb)) {
2562 ntfs_error(sb, "Busy inodes left. This is most likely a NTFS "
2563 "driver bug.");
2564 /* Copied from fs/super.c. I just love this message. (-; */
2565 printk("NTFS: Busy inodes after umount. Self-destruct in 5 "
2566 "seconds. Have a nice day...\n");
2567 }
2568 /* Errors at this stage are irrelevant. */
2569err_out_now:
2570 lock_kernel();
2571 sb->s_fs_info = NULL;
2572 kfree(vol);
2573 ntfs_debug("Failed, returning -EINVAL.");
2574 return -EINVAL;
2575}
2576
2577/*
2578 * This is a slab cache to optimize allocations and deallocations of Unicode
2579 * strings of the maximum length allowed by NTFS, which is NTFS_MAX_NAME_LEN
2580 * (255) Unicode characters + a terminating NULL Unicode character.
2581 */
2582kmem_cache_t *ntfs_name_cache;
2583
2584/* Slab caches for efficient allocation/deallocation of inodes. */
2585kmem_cache_t *ntfs_inode_cache;
2586kmem_cache_t *ntfs_big_inode_cache;
2587
2588/* Init once constructor for the inode slab cache. */
2589static void ntfs_big_inode_init_once(void *foo, kmem_cache_t *cachep,
2590 unsigned long flags)
2591{
2592 ntfs_inode *ni = (ntfs_inode *)foo;
2593
2594 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
2595 SLAB_CTOR_CONSTRUCTOR)
2596 inode_init_once(VFS_I(ni));
2597}
2598
2599/*
2600 * Slab caches to optimize allocations and deallocations of attribute search
2601 * contexts and index contexts, respectively.
2602 */
2603kmem_cache_t *ntfs_attr_ctx_cache;
2604kmem_cache_t *ntfs_index_ctx_cache;
2605
2606/* Driver wide semaphore. */
2607DECLARE_MUTEX(ntfs_lock);
2608
2609static struct super_block *ntfs_get_sb(struct file_system_type *fs_type,
2610 int flags, const char *dev_name, void *data)
2611{
2612 return get_sb_bdev(fs_type, flags, dev_name, data, ntfs_fill_super);
2613}
2614
2615static struct file_system_type ntfs_fs_type = {
2616 .owner = THIS_MODULE,
2617 .name = "ntfs",
2618 .get_sb = ntfs_get_sb,
2619 .kill_sb = kill_block_super,
2620 .fs_flags = FS_REQUIRES_DEV,
2621};
2622
2623/* Stable names for the slab caches. */
2624static const char ntfs_index_ctx_cache_name[] = "ntfs_index_ctx_cache";
2625static const char ntfs_attr_ctx_cache_name[] = "ntfs_attr_ctx_cache";
2626static const char ntfs_name_cache_name[] = "ntfs_name_cache";
2627static const char ntfs_inode_cache_name[] = "ntfs_inode_cache";
2628static const char ntfs_big_inode_cache_name[] = "ntfs_big_inode_cache";
2629
2630static int __init init_ntfs_fs(void)
2631{
2632 int err = 0;
2633
2634 /* This may be ugly but it results in pretty output so who cares. (-8 */
2635 printk(KERN_INFO "NTFS driver " NTFS_VERSION " [Flags: R/"
2636#ifdef NTFS_RW
2637 "W"
2638#else
2639 "O"
2640#endif
2641#ifdef DEBUG
2642 " DEBUG"
2643#endif
2644#ifdef MODULE
2645 " MODULE"
2646#endif
2647 "].\n");
2648
2649 ntfs_debug("Debug messages are enabled.");
2650
2651 ntfs_index_ctx_cache = kmem_cache_create(ntfs_index_ctx_cache_name,
2652 sizeof(ntfs_index_context), 0 /* offset */,
2653 SLAB_HWCACHE_ALIGN, NULL /* ctor */, NULL /* dtor */);
2654 if (!ntfs_index_ctx_cache) {
2655 printk(KERN_CRIT "NTFS: Failed to create %s!\n",
2656 ntfs_index_ctx_cache_name);
2657 goto ictx_err_out;
2658 }
2659 ntfs_attr_ctx_cache = kmem_cache_create(ntfs_attr_ctx_cache_name,
2660 sizeof(ntfs_attr_search_ctx), 0 /* offset */,
2661 SLAB_HWCACHE_ALIGN, NULL /* ctor */, NULL /* dtor */);
2662 if (!ntfs_attr_ctx_cache) {
2663 printk(KERN_CRIT "NTFS: Failed to create %s!\n",
2664 ntfs_attr_ctx_cache_name);
2665 goto actx_err_out;
2666 }
2667
2668 ntfs_name_cache = kmem_cache_create(ntfs_name_cache_name,
2669 (NTFS_MAX_NAME_LEN+1) * sizeof(ntfschar), 0,
2670 SLAB_HWCACHE_ALIGN, NULL, NULL);
2671 if (!ntfs_name_cache) {
2672 printk(KERN_CRIT "NTFS: Failed to create %s!\n",
2673 ntfs_name_cache_name);
2674 goto name_err_out;
2675 }
2676
2677 ntfs_inode_cache = kmem_cache_create(ntfs_inode_cache_name,
2678 sizeof(ntfs_inode), 0,
2679 SLAB_RECLAIM_ACCOUNT, NULL, NULL);
2680 if (!ntfs_inode_cache) {
2681 printk(KERN_CRIT "NTFS: Failed to create %s!\n",
2682 ntfs_inode_cache_name);
2683 goto inode_err_out;
2684 }
2685
2686 ntfs_big_inode_cache = kmem_cache_create(ntfs_big_inode_cache_name,
2687 sizeof(big_ntfs_inode), 0,
2688 SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT,
2689 ntfs_big_inode_init_once, NULL);
2690 if (!ntfs_big_inode_cache) {
2691 printk(KERN_CRIT "NTFS: Failed to create %s!\n",
2692 ntfs_big_inode_cache_name);
2693 goto big_inode_err_out;
2694 }
2695
2696 /* Register the ntfs sysctls. */
2697 err = ntfs_sysctl(1);
2698 if (err) {
2699 printk(KERN_CRIT "NTFS: Failed to register NTFS sysctls!\n");
2700 goto sysctl_err_out;
2701 }
2702
2703 err = register_filesystem(&ntfs_fs_type);
2704 if (!err) {
2705 ntfs_debug("NTFS driver registered successfully.");
2706 return 0; /* Success! */
2707 }
2708 printk(KERN_CRIT "NTFS: Failed to register NTFS file system driver!\n");
2709
2710sysctl_err_out:
2711 kmem_cache_destroy(ntfs_big_inode_cache);
2712big_inode_err_out:
2713 kmem_cache_destroy(ntfs_inode_cache);
2714inode_err_out:
2715 kmem_cache_destroy(ntfs_name_cache);
2716name_err_out:
2717 kmem_cache_destroy(ntfs_attr_ctx_cache);
2718actx_err_out:
2719 kmem_cache_destroy(ntfs_index_ctx_cache);
2720ictx_err_out:
2721 if (!err) {
2722 printk(KERN_CRIT "NTFS: Aborting NTFS file system driver "
2723 "registration...\n");
2724 err = -ENOMEM;
2725 }
2726 return err;
2727}
2728
2729static void __exit exit_ntfs_fs(void)
2730{
2731 int err = 0;
2732
2733 ntfs_debug("Unregistering NTFS driver.");
2734
2735 unregister_filesystem(&ntfs_fs_type);
2736
2737 if (kmem_cache_destroy(ntfs_big_inode_cache) && (err = 1))
2738 printk(KERN_CRIT "NTFS: Failed to destory %s.\n",
2739 ntfs_big_inode_cache_name);
2740 if (kmem_cache_destroy(ntfs_inode_cache) && (err = 1))
2741 printk(KERN_CRIT "NTFS: Failed to destory %s.\n",
2742 ntfs_inode_cache_name);
2743 if (kmem_cache_destroy(ntfs_name_cache) && (err = 1))
2744 printk(KERN_CRIT "NTFS: Failed to destory %s.\n",
2745 ntfs_name_cache_name);
2746 if (kmem_cache_destroy(ntfs_attr_ctx_cache) && (err = 1))
2747 printk(KERN_CRIT "NTFS: Failed to destory %s.\n",
2748 ntfs_attr_ctx_cache_name);
2749 if (kmem_cache_destroy(ntfs_index_ctx_cache) && (err = 1))
2750 printk(KERN_CRIT "NTFS: Failed to destory %s.\n",
2751 ntfs_index_ctx_cache_name);
2752 if (err)
2753 printk(KERN_CRIT "NTFS: This causes memory to leak! There is "
2754 "probably a BUG in the driver! Please report "
2755 "you saw this message to "
2756 "linux-ntfs-dev@lists.sourceforge.net\n");
2757 /* Unregister the ntfs sysctls. */
2758 ntfs_sysctl(0);
2759}
2760
2761MODULE_AUTHOR("Anton Altaparmakov <aia21@cantab.net>");
2762MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2004 Anton Altaparmakov");
2763MODULE_VERSION(NTFS_VERSION);
2764MODULE_LICENSE("GPL");
2765#ifdef DEBUG
2766module_param(debug_msgs, bool, 0);
2767MODULE_PARM_DESC(debug_msgs, "Enable debug messages.");
2768#endif
2769
2770module_init(init_ntfs_fs)
2771module_exit(exit_ntfs_fs)
diff --git a/fs/ntfs/sysctl.c b/fs/ntfs/sysctl.c
new file mode 100644
index 000000000000..75067e4f3036
--- /dev/null
+++ b/fs/ntfs/sysctl.c
@@ -0,0 +1,85 @@
1/*
2 * sysctl.c - Code for sysctl handling in NTFS Linux kernel driver. Part of
3 * the Linux-NTFS project. Adapted from the old NTFS driver,
4 * Copyright (C) 1997 Martin von Löwis, Régis Duchesne
5 *
6 * Copyright (c) 2002-2004 Anton Altaparmakov
7 *
8 * This program/include file is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as published
10 * by the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program/include file is distributed in the hope that it will be
14 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
15 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program (in the main directory of the Linux-NTFS
20 * distribution in the file COPYING); if not, write to the Free Software
21 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
23
24#ifdef DEBUG
25
26#include <linux/module.h>
27
28#ifdef CONFIG_SYSCTL
29
30#include <linux/proc_fs.h>
31#include <linux/sysctl.h>
32
33#include "sysctl.h"
34#include "debug.h"
35
36#define FS_NTFS 1
37
38/* Definition of the ntfs sysctl. */
39static ctl_table ntfs_sysctls[] = {
40 { FS_NTFS, "ntfs-debug", /* Binary and text IDs. */
41 &debug_msgs,sizeof(debug_msgs), /* Data pointer and size. */
42 0644, NULL, &proc_dointvec }, /* Mode, child, proc handler. */
43 { 0 }
44};
45
46/* Define the parent directory /proc/sys/fs. */
47static ctl_table sysctls_root[] = {
48 { CTL_FS, "fs", NULL, 0, 0555, ntfs_sysctls },
49 { 0 }
50};
51
52/* Storage for the sysctls header. */
53static struct ctl_table_header *sysctls_root_table = NULL;
54
55/**
56 * ntfs_sysctl - add or remove the debug sysctl
57 * @add: add (1) or remove (0) the sysctl
58 *
59 * Add or remove the debug sysctl. Return 0 on success or -errno on error.
60 */
61int ntfs_sysctl(int add)
62{
63 if (add) {
64 BUG_ON(sysctls_root_table);
65 sysctls_root_table = register_sysctl_table(sysctls_root, 0);
66 if (!sysctls_root_table)
67 return -ENOMEM;
68#ifdef CONFIG_PROC_FS
69 /*
70 * If the proc file system is in use and we are a module, need
71 * to set the owner of our proc entry to our module. In the
72 * non-modular case, THIS_MODULE is NULL, so this is ok.
73 */
74 ntfs_sysctls[0].de->owner = THIS_MODULE;
75#endif
76 } else {
77 BUG_ON(!sysctls_root_table);
78 unregister_sysctl_table(sysctls_root_table);
79 sysctls_root_table = NULL;
80 }
81 return 0;
82}
83
84#endif /* CONFIG_SYSCTL */
85#endif /* DEBUG */
diff --git a/fs/ntfs/sysctl.h b/fs/ntfs/sysctl.h
new file mode 100644
index 000000000000..df749cc0aac8
--- /dev/null
+++ b/fs/ntfs/sysctl.h
@@ -0,0 +1,42 @@
1/*
2 * sysctl.h - Defines for sysctl handling in NTFS Linux kernel driver. Part of
3 * the Linux-NTFS project. Adapted from the old NTFS driver,
4 * Copyright (C) 1997 Martin von Löwis, Régis Duchesne
5 *
6 * Copyright (c) 2002-2004 Anton Altaparmakov
7 *
8 * This program/include file is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as published
10 * by the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program/include file is distributed in the hope that it will be
14 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
15 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program (in the main directory of the Linux-NTFS
20 * distribution in the file COPYING); if not, write to the Free Software
21 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
23
24#ifndef _LINUX_NTFS_SYSCTL_H
25#define _LINUX_NTFS_SYSCTL_H
26
27#include <linux/config.h>
28
/*
 * Test with defined() so that the condition matches the #ifdef checks that
 * guard the implementation in sysctl.c, and so that a DEBUG macro defined
 * without a value does not break the preprocessor expression.
 */
#if defined(DEBUG) && defined(CONFIG_SYSCTL)

extern int ntfs_sysctl(int add);

#else

/* Nothing to add or remove in this configuration.  Just return success. */
static inline int ntfs_sysctl(int add)
{
	return 0;
}

#endif /* DEBUG && CONFIG_SYSCTL */
42#endif /* _LINUX_NTFS_SYSCTL_H */
diff --git a/fs/ntfs/time.h b/fs/ntfs/time.h
new file mode 100644
index 000000000000..a09a51dabe4e
--- /dev/null
+++ b/fs/ntfs/time.h
@@ -0,0 +1,100 @@
1/*
2 * time.h - NTFS time conversion functions. Part of the Linux-NTFS project.
3 *
4 * Copyright (c) 2001-2004 Anton Altaparmakov
5 *
6 * This program/include file is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as published
8 * by the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program/include file is distributed in the hope that it will be
12 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
13 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program (in the main directory of the Linux-NTFS
18 * distribution in the file COPYING); if not, write to the Free Software
19 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21
22#ifndef _LINUX_NTFS_TIME_H
23#define _LINUX_NTFS_TIME_H
24
25#include <linux/time.h> /* For current_kernel_time(). */
26#include <asm/div64.h> /* For do_div(). */
27
28#include "endian.h"
29
30#define NTFS_TIME_OFFSET ((s64)(369 * 365 + 89) * 24 * 3600 * 10000000)
31
32/**
33 * utc2ntfs - convert Linux UTC time to NTFS time
34 * @ts: Linux UTC time to convert to NTFS time
35 *
36 * Convert the Linux UTC time @ts to its corresponding NTFS time and return
37 * that in little endian format.
38 *
39 * Linux stores time in a struct timespec consisting of a time_t (long at
40 * present) tv_sec and a long tv_nsec where tv_sec is the number of 1-second
41 * intervals since 1st January 1970, 00:00:00 UTC and tv_nsec is the number of
42 * 1-nano-second intervals since the value of tv_sec.
43 *
44 * NTFS uses Microsoft's standard time format which is stored in a s64 and is
45 * measured as the number of 100-nano-second intervals since 1st January 1601,
46 * 00:00:00 UTC.
47 */
48static inline sle64 utc2ntfs(const struct timespec ts)
49{
50 /*
51 * Convert the seconds to 100ns intervals, add the nano-seconds
52 * converted to 100ns intervals, and then add the NTFS time offset.
53 */
54 return cpu_to_sle64((s64)ts.tv_sec * 10000000 + ts.tv_nsec / 100 +
55 NTFS_TIME_OFFSET);
56}
57
58/**
59 * get_current_ntfs_time - get the current time in little endian NTFS format
60 *
61 * Get the current time from the Linux kernel, convert it to its corresponding
62 * NTFS time and return that in little endian format.
63 */
64static inline sle64 get_current_ntfs_time(void)
65{
66 return utc2ntfs(current_kernel_time());
67}
68
69/**
70 * ntfs2utc - convert NTFS time to Linux time
71 * @time: NTFS time (little endian) to convert to Linux UTC
72 *
73 * Convert the little endian NTFS time @time to its corresponding Linux UTC
74 * time and return that in cpu format.
75 *
76 * Linux stores time in a struct timespec consisting of a time_t (long at
77 * present) tv_sec and a long tv_nsec where tv_sec is the number of 1-second
78 * intervals since 1st January 1970, 00:00:00 UTC and tv_nsec is the number of
79 * 1-nano-second intervals since the value of tv_sec.
80 *
81 * NTFS uses Microsoft's standard time format which is stored in a s64 and is
82 * measured as the number of 100 nano-second intervals since 1st January 1601,
83 * 00:00:00 UTC.
84 */
85static inline struct timespec ntfs2utc(const sle64 time)
86{
87 struct timespec ts;
88
89 /* Subtract the NTFS time offset. */
90 s64 t = sle64_to_cpu(time) - NTFS_TIME_OFFSET;
91 /*
92 * Convert the time to 1-second intervals and the remainder to
93 * 1-nano-second intervals.
94 */
95 ts.tv_nsec = do_div(t, 10000000) * 100;
96 ts.tv_sec = t;
97 return ts;
98}
99
100#endif /* _LINUX_NTFS_TIME_H */
diff --git a/fs/ntfs/types.h b/fs/ntfs/types.h
new file mode 100644
index 000000000000..08a55aa53d4e
--- /dev/null
+++ b/fs/ntfs/types.h
@@ -0,0 +1,66 @@
1/*
2 * types.h - Defines for NTFS Linux kernel driver specific types.
3 * Part of the Linux-NTFS project.
4 *
5 * Copyright (c) 2001-2004 Anton Altaparmakov
6 *
7 * This program/include file is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as published
9 * by the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program/include file is distributed in the hope that it will be
13 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program (in the main directory of the Linux-NTFS
19 * distribution in the file COPYING); if not, write to the Free Software
20 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 */
22
23#ifndef _LINUX_NTFS_TYPES_H
24#define _LINUX_NTFS_TYPES_H
25
26#include <linux/types.h>
27
28typedef __le16 le16;
29typedef __le32 le32;
30typedef __le64 le64;
31typedef __u16 __bitwise sle16;
32typedef __u32 __bitwise sle32;
33typedef __u64 __bitwise sle64;
34
35/* 2-byte Unicode character type. */
36typedef le16 ntfschar;
37#define UCHAR_T_SIZE_BITS 1
38
39/*
40 * Clusters are signed 64-bit values on NTFS volumes. We define two types, LCN
41 * and VCN, to allow for type checking and better code readability.
42 */
43typedef s64 VCN;
44typedef sle64 leVCN;
45typedef s64 LCN;
46typedef sle64 leLCN;
47
48/*
49 * The NTFS journal $LogFile uses log sequence numbers which are signed 64-bit
50 * values. We define our own type LSN, to allow for type checking and better
51 * code readability.
52 */
53typedef s64 LSN;
54typedef sle64 leLSN;
55
56typedef enum {
57 FALSE = 0,
58 TRUE = 1
59} BOOL;
60
61typedef enum {
62 CASE_SENSITIVE = 0,
63 IGNORE_CASE = 1,
64} IGNORE_CASE_BOOL;
65
66#endif /* _LINUX_NTFS_TYPES_H */
diff --git a/fs/ntfs/unistr.c b/fs/ntfs/unistr.c
new file mode 100644
index 000000000000..560b0ea255b0
--- /dev/null
+++ b/fs/ntfs/unistr.c
@@ -0,0 +1,384 @@
1/*
2 * unistr.c - NTFS Unicode string handling. Part of the Linux-NTFS project.
3 *
4 * Copyright (c) 2001-2004 Anton Altaparmakov
5 *
6 * This program/include file is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as published
8 * by the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program/include file is distributed in the hope that it will be
12 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
13 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program (in the main directory of the Linux-NTFS
18 * distribution in the file COPYING); if not, write to the Free Software
19 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21
22#include "types.h"
23#include "debug.h"
24#include "ntfs.h"
25
26/*
27 * IMPORTANT
28 * =========
29 *
30 * All these routines assume that the Unicode characters are in little endian
31 * encoding inside the strings!!!
32 */
33
34/*
35 * This is used by the name collation functions to quickly determine what
36 * characters are (in)valid.
37 */
38static const u8 legal_ansi_char_array[0x40] = {
39 0x00, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
40 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
41
42 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
43 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
44
45 0x17, 0x07, 0x18, 0x17, 0x17, 0x17, 0x17, 0x17,
46 0x17, 0x17, 0x18, 0x16, 0x16, 0x17, 0x07, 0x00,
47
48 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17,
49 0x17, 0x17, 0x04, 0x16, 0x18, 0x16, 0x18, 0x18,
50};
51
52/**
53 * ntfs_are_names_equal - compare two Unicode names for equality
54 * @s1: name to compare to @s2
55 * @s1_len: length in Unicode characters of @s1
56 * @s2: name to compare to @s1
57 * @s2_len: length in Unicode characters of @s2
58 * @ic: ignore case bool
59 * @upcase: upcase table (only if @ic == IGNORE_CASE)
60 * @upcase_size: length in Unicode characters of @upcase (if present)
61 *
62 * Compare the names @s1 and @s2 and return TRUE (1) if the names are
63 * identical, or FALSE (0) if they are not identical. If @ic is IGNORE_CASE,
64 * the @upcase table is used to performa a case insensitive comparison.
65 */
66BOOL ntfs_are_names_equal(const ntfschar *s1, size_t s1_len,
67 const ntfschar *s2, size_t s2_len, const IGNORE_CASE_BOOL ic,
68 const ntfschar *upcase, const u32 upcase_size)
69{
70 if (s1_len != s2_len)
71 return FALSE;
72 if (ic == CASE_SENSITIVE)
73 return !ntfs_ucsncmp(s1, s2, s1_len);
74 return !ntfs_ucsncasecmp(s1, s2, s1_len, upcase, upcase_size);
75}
76
77/**
78 * ntfs_collate_names - collate two Unicode names
79 * @name1: first Unicode name to compare
80 * @name2: second Unicode name to compare
81 * @err_val: if @name1 contains an invalid character return this value
82 * @ic: either CASE_SENSITIVE or IGNORE_CASE
83 * @upcase: upcase table (ignored if @ic is CASE_SENSITIVE)
84 * @upcase_len: upcase table size (ignored if @ic is CASE_SENSITIVE)
85 *
86 * ntfs_collate_names collates two Unicode names and returns:
87 *
88 * -1 if the first name collates before the second one,
89 * 0 if the names match,
90 * 1 if the second name collates before the first one, or
91 * @err_val if an invalid character is found in @name1 during the comparison.
92 *
93 * The following characters are considered invalid: '"', '*', '<', '>' and '?'.
94 */
95int ntfs_collate_names(const ntfschar *name1, const u32 name1_len,
96 const ntfschar *name2, const u32 name2_len,
97 const int err_val, const IGNORE_CASE_BOOL ic,
98 const ntfschar *upcase, const u32 upcase_len)
99{
100 u32 cnt, min_len;
101 u16 c1, c2;
102
103 min_len = name1_len;
104 if (name1_len > name2_len)
105 min_len = name2_len;
106 for (cnt = 0; cnt < min_len; ++cnt) {
107 c1 = le16_to_cpu(*name1++);
108 c2 = le16_to_cpu(*name2++);
109 if (ic) {
110 if (c1 < upcase_len)
111 c1 = le16_to_cpu(upcase[c1]);
112 if (c2 < upcase_len)
113 c2 = le16_to_cpu(upcase[c2]);
114 }
115 if (c1 < 64 && legal_ansi_char_array[c1] & 8)
116 return err_val;
117 if (c1 < c2)
118 return -1;
119 if (c1 > c2)
120 return 1;
121 }
122 if (name1_len < name2_len)
123 return -1;
124 if (name1_len == name2_len)
125 return 0;
126 /* name1_len > name2_len */
127 c1 = le16_to_cpu(*name1);
128 if (c1 < 64 && legal_ansi_char_array[c1] & 8)
129 return err_val;
130 return 1;
131}
132
133/**
134 * ntfs_ucsncmp - compare two little endian Unicode strings
135 * @s1: first string
136 * @s2: second string
137 * @n: maximum unicode characters to compare
138 *
139 * Compare the first @n characters of the Unicode strings @s1 and @s2,
140 * The strings in little endian format and appropriate le16_to_cpu()
141 * conversion is performed on non-little endian machines.
142 *
143 * The function returns an integer less than, equal to, or greater than zero
144 * if @s1 (or the first @n Unicode characters thereof) is found, respectively,
145 * to be less than, to match, or be greater than @s2.
146 */
147int ntfs_ucsncmp(const ntfschar *s1, const ntfschar *s2, size_t n)
148{
149 u16 c1, c2;
150 size_t i;
151
152 for (i = 0; i < n; ++i) {
153 c1 = le16_to_cpu(s1[i]);
154 c2 = le16_to_cpu(s2[i]);
155 if (c1 < c2)
156 return -1;
157 if (c1 > c2)
158 return 1;
159 if (!c1)
160 break;
161 }
162 return 0;
163}
164
165/**
166 * ntfs_ucsncasecmp - compare two little endian Unicode strings, ignoring case
167 * @s1: first string
168 * @s2: second string
169 * @n: maximum unicode characters to compare
170 * @upcase: upcase table
171 * @upcase_size: upcase table size in Unicode characters
172 *
173 * Compare the first @n characters of the Unicode strings @s1 and @s2,
174 * ignoring case. The strings in little endian format and appropriate
175 * le16_to_cpu() conversion is performed on non-little endian machines.
176 *
177 * Each character is uppercased using the @upcase table before the comparison.
178 *
179 * The function returns an integer less than, equal to, or greater than zero
180 * if @s1 (or the first @n Unicode characters thereof) is found, respectively,
181 * to be less than, to match, or be greater than @s2.
182 */
183int ntfs_ucsncasecmp(const ntfschar *s1, const ntfschar *s2, size_t n,
184 const ntfschar *upcase, const u32 upcase_size)
185{
186 size_t i;
187 u16 c1, c2;
188
189 for (i = 0; i < n; ++i) {
190 if ((c1 = le16_to_cpu(s1[i])) < upcase_size)
191 c1 = le16_to_cpu(upcase[c1]);
192 if ((c2 = le16_to_cpu(s2[i])) < upcase_size)
193 c2 = le16_to_cpu(upcase[c2]);
194 if (c1 < c2)
195 return -1;
196 if (c1 > c2)
197 return 1;
198 if (!c1)
199 break;
200 }
201 return 0;
202}
203
204void ntfs_upcase_name(ntfschar *name, u32 name_len, const ntfschar *upcase,
205 const u32 upcase_len)
206{
207 u32 i;
208 u16 u;
209
210 for (i = 0; i < name_len; i++)
211 if ((u = le16_to_cpu(name[i])) < upcase_len)
212 name[i] = upcase[u];
213}
214
215void ntfs_file_upcase_value(FILE_NAME_ATTR *file_name_attr,
216 const ntfschar *upcase, const u32 upcase_len)
217{
218 ntfs_upcase_name((ntfschar*)&file_name_attr->file_name,
219 file_name_attr->file_name_length, upcase, upcase_len);
220}
221
222int ntfs_file_compare_values(FILE_NAME_ATTR *file_name_attr1,
223 FILE_NAME_ATTR *file_name_attr2,
224 const int err_val, const IGNORE_CASE_BOOL ic,
225 const ntfschar *upcase, const u32 upcase_len)
226{
227 return ntfs_collate_names((ntfschar*)&file_name_attr1->file_name,
228 file_name_attr1->file_name_length,
229 (ntfschar*)&file_name_attr2->file_name,
230 file_name_attr2->file_name_length,
231 err_val, ic, upcase, upcase_len);
232}
233
234/**
235 * ntfs_nlstoucs - convert NLS string to little endian Unicode string
236 * @vol: ntfs volume which we are working with
237 * @ins: input NLS string buffer
238 * @ins_len: length of input string in bytes
239 * @outs: on return contains the allocated output Unicode string buffer
240 *
241 * Convert the input string @ins, which is in whatever format the loaded NLS
242 * map dictates, into a little endian, 2-byte Unicode string.
243 *
244 * This function allocates the string and the caller is responsible for
245 * calling kmem_cache_free(ntfs_name_cache, @outs); when finished with it.
246 *
247 * On success the function returns the number of Unicode characters written to
248 * the output string *@outs (>= 0), not counting the terminating Unicode NULL
249 * character. *@outs is set to the allocated output string buffer.
250 *
251 * On error, a negative number corresponding to the error code is returned. In
252 * that case the output string is not allocated. Both *@outs and *@outs_len
253 * are then undefined.
254 *
255 * This might look a bit odd due to fast path optimization...
256 */
257int ntfs_nlstoucs(const ntfs_volume *vol, const char *ins,
258 const int ins_len, ntfschar **outs)
259{
260 struct nls_table *nls = vol->nls_map;
261 ntfschar *ucs;
262 wchar_t wc;
263 int i, o, wc_len;
264
265 /* We don't trust outside sources. */
266 if (ins) {
267 ucs = (ntfschar*)kmem_cache_alloc(ntfs_name_cache, SLAB_NOFS);
268 if (ucs) {
269 for (i = o = 0; i < ins_len; i += wc_len) {
270 wc_len = nls->char2uni(ins + i, ins_len - i,
271 &wc);
272 if (wc_len >= 0) {
273 if (wc) {
274 ucs[o++] = cpu_to_le16(wc);
275 continue;
276 } /* else (!wc) */
277 break;
278 } /* else (wc_len < 0) */
279 goto conversion_err;
280 }
281 ucs[o] = 0;
282 *outs = ucs;
283 return o;
284 } /* else (!ucs) */
285 ntfs_error(vol->sb, "Failed to allocate name from "
286 "ntfs_name_cache!");
287 return -ENOMEM;
288 } /* else (!ins) */
289 ntfs_error(NULL, "Received NULL pointer.");
290 return -EINVAL;
291conversion_err:
292 ntfs_error(vol->sb, "Name using character set %s contains characters "
293 "that cannot be converted to Unicode.", nls->charset);
294 kmem_cache_free(ntfs_name_cache, ucs);
295 return -EILSEQ;
296}
297
298/**
299 * ntfs_ucstonls - convert little endian Unicode string to NLS string
300 * @vol: ntfs volume which we are working with
301 * @ins: input Unicode string buffer
302 * @ins_len: length of input string in Unicode characters
303 * @outs: on return contains the (allocated) output NLS string buffer
304 * @outs_len: length of output string buffer in bytes
305 *
306 * Convert the input little endian, 2-byte Unicode string @ins, of length
307 * @ins_len into the string format dictated by the loaded NLS.
308 *
309 * If *@outs is NULL, this function allocates the string and the caller is
310 * responsible for calling kfree(*@outs); when finished with it. In this case
311 * @outs_len is ignored and can be 0.
312 *
313 * On success the function returns the number of bytes written to the output
314 * string *@outs (>= 0), not counting the terminating NULL byte. If the output
315 * string buffer was allocated, *@outs is set to it.
316 *
317 * On error, a negative number corresponding to the error code is returned. In
318 * that case the output string is not allocated. The contents of *@outs are
319 * then undefined.
320 *
321 * This might look a bit odd due to fast path optimization...
322 */
323int ntfs_ucstonls(const ntfs_volume *vol, const ntfschar *ins,
324 const int ins_len, unsigned char **outs, int outs_len)
325{
326 struct nls_table *nls = vol->nls_map;
327 unsigned char *ns;
328 int i, o, ns_len, wc;
329
330 /* We don't trust outside sources. */
331 if (ins) {
332 ns = *outs;
333 ns_len = outs_len;
334 if (ns && !ns_len) {
335 wc = -ENAMETOOLONG;
336 goto conversion_err;
337 }
338 if (!ns) {
339 ns_len = ins_len * NLS_MAX_CHARSET_SIZE;
340 ns = (unsigned char*)kmalloc(ns_len + 1, GFP_NOFS);
341 if (!ns)
342 goto mem_err_out;
343 }
344 for (i = o = 0; i < ins_len; i++) {
345retry: wc = nls->uni2char(le16_to_cpu(ins[i]), ns + o,
346 ns_len - o);
347 if (wc > 0) {
348 o += wc;
349 continue;
350 } else if (!wc)
351 break;
352 else if (wc == -ENAMETOOLONG && ns != *outs) {
353 unsigned char *tc;
354 /* Grow in multiples of 64 bytes. */
355 tc = (unsigned char*)kmalloc((ns_len + 64) &
356 ~63, GFP_NOFS);
357 if (tc) {
358 memcpy(tc, ns, ns_len);
359 ns_len = ((ns_len + 64) & ~63) - 1;
360 kfree(ns);
361 ns = tc;
362 goto retry;
363 } /* No memory so goto conversion_error; */
364 } /* wc < 0, real error. */
365 goto conversion_err;
366 }
367 ns[o] = 0;
368 *outs = ns;
369 return o;
370 } /* else (!ins) */
371 ntfs_error(vol->sb, "Received NULL pointer.");
372 return -EINVAL;
373conversion_err:
374 ntfs_error(vol->sb, "Unicode name contains characters that cannot be "
375 "converted to character set %s.", nls->charset);
376 if (ns != *outs)
377 kfree(ns);
378 if (wc != -ENAMETOOLONG)
379 wc = -EILSEQ;
380 return wc;
381mem_err_out:
382 ntfs_error(vol->sb, "Failed to allocate name!");
383 return -ENOMEM;
384}
diff --git a/fs/ntfs/upcase.c b/fs/ntfs/upcase.c
new file mode 100644
index 000000000000..879cdf1d5bd3
--- /dev/null
+++ b/fs/ntfs/upcase.c
@@ -0,0 +1,90 @@
1/*
2 * upcase.c - Generate the full NTFS Unicode upcase table in little endian.
3 * Part of the Linux-NTFS project.
4 *
5 * Copyright (c) 2001 Richard Russon <ntfs@flatcap.org>
6 * Copyright (c) 2001-2004 Anton Altaparmakov
7 *
8 * Modified for mkntfs inclusion 9 June 2001 by Anton Altaparmakov.
9 * Modified for kernel inclusion 10 September 2001 by Anton Altaparmakov.
10 *
11 * This program is free software; you can redistribute it and/or modify it
12 * under the terms of the GNU General Public License as published by the Free
13 * Software Foundation; either version 2 of the License, or (at your option)
14 * any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program (in the main directory of the Linux-NTFS source
23 * in the file COPYING); if not, write to the Free Software Foundation,
24 * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25 */
26
27#include "malloc.h"
28#include "ntfs.h"
29
30ntfschar *generate_default_upcase(void)
31{
32 static const int uc_run_table[][3] = { /* Start, End, Add */
33 {0x0061, 0x007B, -32}, {0x0451, 0x045D, -80}, {0x1F70, 0x1F72, 74},
34 {0x00E0, 0x00F7, -32}, {0x045E, 0x0460, -80}, {0x1F72, 0x1F76, 86},
35 {0x00F8, 0x00FF, -32}, {0x0561, 0x0587, -48}, {0x1F76, 0x1F78, 100},
36 {0x0256, 0x0258, -205}, {0x1F00, 0x1F08, 8}, {0x1F78, 0x1F7A, 128},
37 {0x028A, 0x028C, -217}, {0x1F10, 0x1F16, 8}, {0x1F7A, 0x1F7C, 112},
38 {0x03AC, 0x03AD, -38}, {0x1F20, 0x1F28, 8}, {0x1F7C, 0x1F7E, 126},
39 {0x03AD, 0x03B0, -37}, {0x1F30, 0x1F38, 8}, {0x1FB0, 0x1FB2, 8},
40 {0x03B1, 0x03C2, -32}, {0x1F40, 0x1F46, 8}, {0x1FD0, 0x1FD2, 8},
41 {0x03C2, 0x03C3, -31}, {0x1F51, 0x1F52, 8}, {0x1FE0, 0x1FE2, 8},
42 {0x03C3, 0x03CC, -32}, {0x1F53, 0x1F54, 8}, {0x1FE5, 0x1FE6, 7},
43 {0x03CC, 0x03CD, -64}, {0x1F55, 0x1F56, 8}, {0x2170, 0x2180, -16},
44 {0x03CD, 0x03CF, -63}, {0x1F57, 0x1F58, 8}, {0x24D0, 0x24EA, -26},
45 {0x0430, 0x0450, -32}, {0x1F60, 0x1F68, 8}, {0xFF41, 0xFF5B, -32},
46 {0}
47 };
48
49 static const int uc_dup_table[][2] = { /* Start, End */
50 {0x0100, 0x012F}, {0x01A0, 0x01A6}, {0x03E2, 0x03EF}, {0x04CB, 0x04CC},
51 {0x0132, 0x0137}, {0x01B3, 0x01B7}, {0x0460, 0x0481}, {0x04D0, 0x04EB},
52 {0x0139, 0x0149}, {0x01CD, 0x01DD}, {0x0490, 0x04BF}, {0x04EE, 0x04F5},
53 {0x014A, 0x0178}, {0x01DE, 0x01EF}, {0x04BF, 0x04BF}, {0x04F8, 0x04F9},
54 {0x0179, 0x017E}, {0x01F4, 0x01F5}, {0x04C1, 0x04C4}, {0x1E00, 0x1E95},
55 {0x018B, 0x018B}, {0x01FA, 0x0218}, {0x04C7, 0x04C8}, {0x1EA0, 0x1EF9},
56 {0}
57 };
58
59 static const int uc_word_table[][2] = { /* Offset, Value */
60 {0x00FF, 0x0178}, {0x01AD, 0x01AC}, {0x01F3, 0x01F1}, {0x0269, 0x0196},
61 {0x0183, 0x0182}, {0x01B0, 0x01AF}, {0x0253, 0x0181}, {0x026F, 0x019C},
62 {0x0185, 0x0184}, {0x01B9, 0x01B8}, {0x0254, 0x0186}, {0x0272, 0x019D},
63 {0x0188, 0x0187}, {0x01BD, 0x01BC}, {0x0259, 0x018F}, {0x0275, 0x019F},
64 {0x018C, 0x018B}, {0x01C6, 0x01C4}, {0x025B, 0x0190}, {0x0283, 0x01A9},
65 {0x0192, 0x0191}, {0x01C9, 0x01C7}, {0x0260, 0x0193}, {0x0288, 0x01AE},
66 {0x0199, 0x0198}, {0x01CC, 0x01CA}, {0x0263, 0x0194}, {0x0292, 0x01B7},
67 {0x01A8, 0x01A7}, {0x01DD, 0x018E}, {0x0268, 0x0197},
68 {0}
69 };
70
71 int i, r;
72 ntfschar *uc;
73
74 uc = ntfs_malloc_nofs(default_upcase_len * sizeof(ntfschar));
75 if (!uc)
76 return uc;
77 memset(uc, 0, default_upcase_len * sizeof(ntfschar));
78 for (i = 0; i < default_upcase_len; i++)
79 uc[i] = cpu_to_le16(i);
80 for (r = 0; uc_run_table[r][0]; r++)
81 for (i = uc_run_table[r][0]; i < uc_run_table[r][1]; i++)
82 uc[i] = cpu_to_le16((le16_to_cpu(uc[i]) +
83 uc_run_table[r][2]));
84 for (r = 0; uc_dup_table[r][0]; r++)
85 for (i = uc_dup_table[r][0]; i < uc_dup_table[r][1]; i += 2)
86 uc[i + 1] = cpu_to_le16(le16_to_cpu(uc[i + 1]) - 1);
87 for (r = 0; uc_word_table[r][0]; r++)
88 uc[uc_word_table[r][0]] = cpu_to_le16(uc_word_table[r][1]);
89 return uc;
90}
diff --git a/fs/ntfs/volume.h b/fs/ntfs/volume.h
new file mode 100644
index 000000000000..4b97fa8635a8
--- /dev/null
+++ b/fs/ntfs/volume.h
@@ -0,0 +1,171 @@
1/*
2 * volume.h - Defines for volume structures in NTFS Linux kernel driver. Part
3 * of the Linux-NTFS project.
4 *
5 * Copyright (c) 2001-2004 Anton Altaparmakov
6 * Copyright (c) 2002 Richard Russon
7 *
8 * This program/include file is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as published
10 * by the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program/include file is distributed in the hope that it will be
14 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
15 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program (in the main directory of the Linux-NTFS
20 * distribution in the file COPYING); if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
23
24#ifndef _LINUX_NTFS_VOLUME_H
25#define _LINUX_NTFS_VOLUME_H
26
27#include <linux/rwsem.h>
28
29#include "types.h"
30#include "layout.h"
31
32/*
33 * The NTFS in memory super block structure.
34 */
35typedef struct {
36 /*
37 * FIXME: Reorder to have commonly used together element within the
38 * same cache line, aiming at a cache line size of 32 bytes. Aim for
39 * 64 bytes for less commonly used together elements. Put most commonly
40 * used elements to front of structure. Obviously do this only when the
41 * structure has stabilized... (AIA)
42 */
43 /* Device specifics. */
44 struct super_block *sb; /* Pointer back to the super_block,
45 so we don't have to get the offset
46 every time. */
47 LCN nr_blocks; /* Number of NTFS_BLOCK_SIZE bytes
48 sized blocks on the device. */
49 /* Configuration provided by user at mount time. */
50 unsigned long flags; /* Miscellaneous flags, see below. */
51 uid_t uid; /* uid that files will be mounted as. */
52 gid_t gid; /* gid that files will be mounted as. */
53 mode_t fmask; /* The mask for file permissions. */
54 mode_t dmask; /* The mask for directory
55 permissions. */
56 u8 mft_zone_multiplier; /* Initial mft zone multiplier. */
57 u8 on_errors; /* What to do on file system errors. */
58 /* NTFS bootsector provided information. */
59 u16 sector_size; /* in bytes */
60 u8 sector_size_bits; /* log2(sector_size) */
61 u32 cluster_size; /* in bytes */
62 u32 cluster_size_mask; /* cluster_size - 1 */
63 u8 cluster_size_bits; /* log2(cluster_size) */
64 u32 mft_record_size; /* in bytes */
65 u32 mft_record_size_mask; /* mft_record_size - 1 */
66 u8 mft_record_size_bits; /* log2(mft_record_size) */
67 u32 index_record_size; /* in bytes */
68 u32 index_record_size_mask; /* index_record_size - 1 */
69 u8 index_record_size_bits; /* log2(index_record_size) */
70 LCN nr_clusters; /* Volume size in clusters == number of
71 bits in lcn bitmap. */
72 LCN mft_lcn; /* Cluster location of mft data. */
73 LCN mftmirr_lcn; /* Cluster location of copy of mft. */
74 u64 serial_no; /* The volume serial number. */
75 /* Mount specific NTFS information. */
76 u32 upcase_len; /* Number of entries in upcase[]. */
77 ntfschar *upcase; /* The upcase table. */
78
79 s32 attrdef_size; /* Size of the attribute definition
80 table in bytes. */
81 ATTR_DEF *attrdef; /* Table of attribute definitions.
82 Obtained from FILE_AttrDef. */
83
84#ifdef NTFS_RW
85 /* Variables used by the cluster and mft allocators. */
86 s64 mft_data_pos; /* Mft record number at which to
87 allocate the next mft record. */
88 LCN mft_zone_start; /* First cluster of the mft zone. */
89 LCN mft_zone_end; /* First cluster beyond the mft zone. */
90 LCN mft_zone_pos; /* Current position in the mft zone. */
91 LCN data1_zone_pos; /* Current position in the first data
92 zone. */
93 LCN data2_zone_pos; /* Current position in the second data
94 zone. */
95#endif /* NTFS_RW */
96
97 struct inode *mft_ino; /* The VFS inode of $MFT. */
98
99 struct inode *mftbmp_ino; /* Attribute inode for $MFT/$BITMAP. */
100 struct rw_semaphore mftbmp_lock; /* Lock for serializing accesses to the
101 mft record bitmap ($MFT/$BITMAP). */
102#ifdef NTFS_RW
103 struct inode *mftmirr_ino; /* The VFS inode of $MFTMirr. */
104 int mftmirr_size; /* Size of mft mirror in mft records. */
105
106 struct inode *logfile_ino; /* The VFS inode of $LogFile. */
107#endif /* NTFS_RW */
108
109 struct inode *lcnbmp_ino; /* The VFS inode of $Bitmap. */
110 struct rw_semaphore lcnbmp_lock; /* Lock for serializing accesses to the
111 cluster bitmap ($Bitmap/$DATA). */
112
113 struct inode *vol_ino; /* The VFS inode of $Volume. */
114 VOLUME_FLAGS vol_flags; /* Volume flags. */
115 u8 major_ver; /* Ntfs major version of volume. */
116 u8 minor_ver; /* Ntfs minor version of volume. */
117
118 struct inode *root_ino; /* The VFS inode of the root
119 directory. */
120 struct inode *secure_ino; /* The VFS inode of $Secure (NTFS3.0+
121 only, otherwise NULL). */
122 struct inode *extend_ino; /* The VFS inode of $Extend (NTFS3.0+
123 only, otherwise NULL). */
124#ifdef NTFS_RW
125 /* $Quota stuff is NTFS3.0+ specific. Unused/NULL otherwise. */
126 struct inode *quota_ino; /* The VFS inode of $Quota. */
127 struct inode *quota_q_ino; /* Attribute inode for $Quota/$Q. */
128#endif /* NTFS_RW */
129 struct nls_table *nls_map;
130} ntfs_volume;
131
/*
 * Bit numbers for the flags field in the ntfs_volume structure.  Each
 * description states what the bit means when it is set.
 */
typedef enum {
	/* Volume has errors, prevent remount rw. */
	NV_Errors,
	/* Return system files in ntfs_readdir(). */
	NV_ShowSystemFiles,
	/*
	 * Treat file names as case sensitive and create filenames in the
	 * POSIX namespace.  When clear, be case insensitive and create file
	 * names in the WIN32 namespace.
	 */
	NV_CaseSensitive,
	/* $LogFile journal is empty. */
	NV_LogFileEmpty,
	/* $Quota is out of date. */
	NV_QuotaOutOfDate,
} ntfs_volume_flags;
145
146/*
147 * Macro tricks to expand the NVolFoo(), NVolSetFoo(), and NVolClearFoo()
148 * functions.
149 */
150#define NVOL_FNS(flag) \
151static inline int NVol##flag(ntfs_volume *vol) \
152{ \
153 return test_bit(NV_##flag, &(vol)->flags); \
154} \
155static inline void NVolSet##flag(ntfs_volume *vol) \
156{ \
157 set_bit(NV_##flag, &(vol)->flags); \
158} \
159static inline void NVolClear##flag(ntfs_volume *vol) \
160{ \
161 clear_bit(NV_##flag, &(vol)->flags); \
162}
163
164/* Emit the ntfs volume bitops functions. */
165NVOL_FNS(Errors)
166NVOL_FNS(ShowSystemFiles)
167NVOL_FNS(CaseSensitive)
168NVOL_FNS(LogFileEmpty)
169NVOL_FNS(QuotaOutOfDate)
170
171#endif /* _LINUX_NTFS_VOLUME_H */