44 files changed, 26034 insertions, 0 deletions
diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
new file mode 100644
index 000000000000..1d2ad15f1533
--- /dev/null
+++ b/fs/ntfs/ChangeLog
@@ -0,0 +1,1350 @@
+ToDo/Notes:
+        - Find and fix bugs.
+        - Checkpoint or disable the user space journal ($UsnJrnl).
+        - In between ntfs_prepare/commit_write, need exclusion between
+          simultaneous file extensions. Need perhaps an NInoResizeUnderway()
+          flag which we can set in ntfs_prepare_write() and clear again in
+          ntfs_commit_write(). Just have to be careful in readpage/writepage,
+          as well as in truncate, that we play nice... We might need to have
+          a data_size field in the ntfs_inode to store the real attribute
+          length. Also need to be careful with initialized_size extension in
+          ntfs_prepare_write. Basically, just be _very_ careful in this code...
+          OTOH, perhaps i_sem, which is held across generic_file_write is
+          sufficient for synchronisation here. We then just need to make sure
+          ntfs_readpage/writepage/truncate interoperate properly with us.
+          UPDATE: The above is all ok as it is due to i_sem held.  The only
+          thing that needs to be checked is ntfs_writepage() which does not
+          hold i_sem.  It cannot change i_size but it needs to cope with a
+          concurrent i_size change.
+        - Implement mft.c::sync_mft_mirror_umount().  We currently will just
+          leave the volume dirty on umount if the final iput(vol->mft_ino)
+          causes a write of any mirrored mft records due to the mft mirror
+          inode having been discarded already.  Whether this can actually ever
+          happen is unclear however so it is worth waiting until someone hits
+          the problem.
+        - Enable the code for setting the NT4 compatibility flag when we start
+          making NTFS 1.2 specific modifications.
+2.1.23-WIP
+        - Add printk rate limiting for ntfs_warning() and ntfs_error() when
+          compiled without debug.  This avoids a possible denial of service
+          attack.  Thanks to Carl-Daniel Hailfinger from SuSE for pointing this
+          out.
+2.1.22 - Many bug and race fixes and error handling improvements.
+        - Improve error handling in fs/ntfs/inode.c::ntfs_truncate().
+        - Change fs/ntfs/inode.c::ntfs_truncate() to return an error code
+          instead of void and provide a helper ntfs_truncate_vfs() for the
+          vfs ->truncate method.
+        - Add a new ntfs inode flag NInoTruncateFailed() and modify
+          fs/ntfs/inode.c::ntfs_truncate() to set and clear it appropriately.
+        - Fix min_size and max_size definitions in ATTR_DEF structure in
+          fs/ntfs/layout.h to be signed.
+        - Add attribute definition handling helpers to fs/ntfs/attrib.[hc]:
+          ntfs_attr_size_bounds_check(), ntfs_attr_can_be_non_resident(), and
+          ntfs_attr_can_be_resident(), which in turn use the new private helper
+          ntfs_attr_find_in_attrdef().
+        - In fs/ntfs/aops.c::mark_ntfs_record_dirty(), take the
+          mapping->private_lock around the dirtying of the buffer heads
+          analogous to the way it is done in __set_page_dirty_buffers().
+        - Ensure the mft record size does not exceed the PAGE_CACHE_SIZE at
+          mount time as this cannot work with the current implementation.
+        - Check for location of attribute name and improve error handling in
+          general in fs/ntfs/inode.c::ntfs_read_locked_inode() and friends.
+        - In fs/ntfs/aops.c::ntfs_writepage(), if the page is fully outside
+          i_size, i.e. race with truncate, invalidate the buffers on the page
+          so that they become freeable and hence the page does not leak.
+        - Remove unused function fs/ntfs/runlist.c::ntfs_rl_merge().  (Adrian
+          Bunk)
+        - Fix stupid bug in fs/ntfs/attrib.c::ntfs_attr_find() that resulted in
+          a NULL pointer dereference in the error code path when a corrupt
+          attribute was found.  (Thanks to Domen Puncer for the bug report.)
+        - Add MODULE_VERSION() to fs/ntfs/super.c.
+        - Make several functions and variables static.  (Adrian Bunk)
+        - Modify fs/ntfs/aops.c::mark_ntfs_record_dirty() so it allocates
+          buffers for the page if they are not present and then marks the
+          buffers belonging to the ntfs record dirty.  This causes the buffers
+          to become busy and hence they are safe from removal until the page
+          has been written out.
+        - Fix stupid bug in fs/ntfs/attrib.c::ntfs_external_attr_find() in the
+          error handling code path that resulted in a BUG() due to trying to
+          unmap an extent mft record when the mapping of it had failed and it
+          thus was not mapped.  (Thanks to Ken MacFerrin for the bug report.)
+        - Drop the runlist lock after the vcn has been read in
+          fs/ntfs/lcnalloc.c::__ntfs_cluster_free().
+        - Rewrite handling of multi sector transfer errors.  We now do not set
+          PageError() when such errors are detected in the async i/o handler
+          fs/ntfs/aops.c::ntfs_end_buffer_async_read().  All users of mst
+          protected attributes now check the magic of each ntfs record as they
+          use it and act appropriately.  This has the effect of making errors
+          granular per ntfs record rather than per page which solves the case
+          where we cannot access any of the ntfs records in a page when a
+          single one of them had an mst error.  (Thanks to Ken MacFerrin for
+          the bug report.)
+        - Fix error handling in fs/ntfs/quota.c::ntfs_mark_quotas_out_of_date()
+          where we failed to release i_sem on the $Quota/$Q attribute inode.
+        - Fix bug in handling of bad inodes in fs/ntfs/namei.c::ntfs_lookup().
+        - Add mapping of unmapped buffers to all remaining code paths, i.e.
+          fs/ntfs/aops.c::ntfs_write_mst_block(), mft.c::ntfs_sync_mft_mirror(),
+          and write_mft_record_nolock().  From now on we require that the
+          complete runlist for the mft mirror is always mapped into memory.
+        - Add creation of buffers to fs/ntfs/mft.c::ntfs_sync_mft_mirror().
+        - Improve error handling in fs/ntfs/aops.c::ntfs_{read,write}_block().
+        - Cleanup fs/ntfs/aops.c::ntfs_{read,write}page() since we know that a
+          resident attribute will be smaller than a page which makes the code
+          simpler.  Also make the code more tolerant to concurrent ->truncate.
+2.1.21 - Fix some races and bugs, rewrite mft write code, add mft allocator.
+        - Implement extent mft record deallocation
+          fs/ntfs/mft.c::ntfs_extent_mft_record_free().
+        - Split runlist related functions off from attrib.[hc] to runlist.[hc].
+        - Add vol->mft_data_pos and initialize it at mount time.
+        - Rename init_runlist() to ntfs_init_runlist(), ntfs_vcn_to_lcn() to
+          ntfs_rl_vcn_to_lcn(), decompress_mapping_pairs() to
+          ntfs_mapping_pairs_decompress(), ntfs_merge_runlists() to
+          ntfs_runlists_merge() and adapt all callers.
+        - Add fs/ntfs/runlist.[hc]::ntfs_get_nr_significant_bytes(),
+          ntfs_get_size_for_mapping_pairs(), ntfs_write_significant_bytes(),
+          and ntfs_mapping_pairs_build(), adapted from libntfs.
+        - Make fs/ntfs/lcnalloc.c::ntfs_cluster_free_from_rl_nolock() not
+          static and add a declaration for it to lcnalloc.h.
+        - Add fs/ntfs/lcnalloc.h::ntfs_cluster_free_from_rl() which is a static
+          inline wrapper for ntfs_cluster_free_from_rl_nolock() which takes the
+          cluster bitmap lock for the duration of the call.
+        - Add fs/ntfs/attrib.[hc]::ntfs_attr_record_resize().
+        - Implement the equivalent of memset() for an ntfs attribute in
+          fs/ntfs/attrib.[hc]::ntfs_attr_set() and switch
+          fs/ntfs/logfile.c::ntfs_empty_logfile() to using it.
+        - Remove unnecessary casts from LCN_* constants.
+        - Implement fs/ntfs/runlist.c::ntfs_rl_truncate_nolock().
+        - Add MFT_RECORD_OLD as a copy of MFT_RECORD in fs/ntfs/layout.h and
+          change MFT_RECORD to contain the NTFS 3.1+ specific fields.
+        - Add a helper function fs/ntfs/aops.c::mark_ntfs_record_dirty() which
+          marks all buffers belonging to an ntfs record dirty, followed by
+          marking the page the ntfs record is in dirty and also marking the vfs
+          inode containing the ntfs record dirty (I_DIRTY_PAGES).
+        - Switch fs/ntfs/index.h::ntfs_index_entry_mark_dirty() to using the
+          new helper fs/ntfs/aops.c::mark_ntfs_record_dirty() and remove the no
+          longer needed fs/ntfs/index.[hc]::__ntfs_index_entry_mark_dirty().
+        - Move ntfs_{un,}map_page() from ntfs.h to aops.h and fix resulting
+          include errors.
+        - Move the typedefs for runlist_element and runlist from types.h to
+          runlist.h and fix resulting include errors.
+        - Remove unused {__,}format_mft_record() from fs/ntfs/mft.c.
+        - Modify fs/ntfs/mft.c::__mark_mft_record_dirty() to use the helper
+          mark_ntfs_record_dirty() which also changes the behaviour in that we
+          now set the buffers belonging to the mft record dirty as well as the
+          page itself.
+        - Update fs/ntfs/mft.c::write_mft_record_nolock() and sync_mft_mirror()
+          to cope with the fact that there now are dirty buffers in mft pages.
+        - Update fs/ntfs/inode.c::ntfs_write_inode() to also use the helper
+          mark_ntfs_record_dirty() and thus to set the buffers belonging to the
+          mft record dirty as well as the page itself.
+        - Fix compiler warnings on x86-64 in fs/ntfs/dir.c.  (Randy Dunlap,
+          slightly modified by me)
+        - Add fs/ntfs/mft.c::try_map_mft_record() which fails with -EALREADY if
+          the mft record is already locked and otherwise behaves the same way
+          as fs/ntfs/mft.c::map_mft_record().
+        - Modify fs/ntfs/mft.c::write_mft_record_nolock() so that it only
+          writes the mft record if the buffers belonging to it are dirty.
+          Otherwise we assume that it was written out by other means already.
+        - Attempting to write outside initialized size is _not_ a bug so remove
+          the bug check from fs/ntfs/aops.c::ntfs_write_mst_block().  It is in
+          fact required to write outside initialized size when preparing to
+          extend the initialized size.
+        - Map the page instead of using page_address() before writing to it in
+          fs/ntfs/aops.c::ntfs_mft_writepage().
+        - Provide exclusion between opening an inode / mapping an mft record
+          and accessing the mft record in fs/ntfs/mft.c::ntfs_mft_writepage()
+          by setting the page not uptodate throughout ntfs_mft_writepage().
+        - Clear the page uptodate flag in fs/ntfs/aops.c::ntfs_write_mst_block()
+          to ensure no one can see the page whilst the mst fixups are applied.
+        - Add the helper fs/ntfs/mft.c::ntfs_may_write_mft_record() which
+          checks if an mft record may be written out safely obtaining any
+          necessary locks in the process.  This is used by
+          fs/ntfs/aops.c::ntfs_write_mst_block().
+        - Modify fs/ntfs/aops.c::ntfs_write_mst_block() to also work for
+          writing mft records and improve its error handling in the process.
+          Now if any of the records in the page fail to be written out, all
+          other records will be written out instead of aborting completely.
+        - Remove ntfs_mft_aops and update all users to use ntfs_mst_aops.
+        - Modify fs/ntfs/inode.c::ntfs_read_locked_inode() to set the
+          ntfs_mst_aops for all inodes which are NInoMstProtected() and
+          ntfs_aops for all other inodes.
+        - Rename fs/ntfs/mft.c::sync_mft_mirror{,_umount}() to
+          ntfs_sync_mft_mirror{,_umount}() and change their parameters so they
+          no longer require an ntfs inode to be present.  Update all callers.
+        - Cleanup the error handling in fs/ntfs/mft.c::ntfs_sync_mft_mirror().
+        - Clear the page uptodate flag in fs/ntfs/mft.c::ntfs_sync_mft_mirror()
+          to ensure no one can see the page whilst the mst fixups are applied.
+        - Remove the no longer needed fs/ntfs/mft.c::ntfs_mft_writepage() and
+          fs/ntfs/mft.c::try_map_mft_record().
+        - Fix callers of fs/ntfs/aops.c::mark_ntfs_record_dirty() to call it
+          with the ntfs inode which contains the page rather than the ntfs
+          inode the mft record of which is in the page.
+        - Fix race condition in fs/ntfs/inode.c::ntfs_put_inode() by moving the
+          index inode bitmap inode release code from there to
+          fs/ntfs/inode.c::ntfs_clear_big_inode().  (Thanks to Christoph
+          Hellwig for spotting this.)
+        - Fix race condition in fs/ntfs/inode.c::ntfs_put_inode() by taking the
+          inode semaphore around the code that sets ni->itype.index.bmp_ino to
+          NULL and reorganize the code to optimize it a bit.  (Thanks to
+          Christoph Hellwig for spotting this.)
+        - Modify fs/ntfs/aops.c::mark_ntfs_record_dirty() to no longer take the
+          ntfs inode as a parameter as this is confusing and misleading and the
+          needed ntfs inode is available via NTFS_I(page->mapping->host).
+          Adapt all callers to this change.
+        - Modify fs/ntfs/mft.c::write_mft_record_nolock() and
+          fs/ntfs/aops.c::ntfs_write_mst_block() to only check the dirty state
+          of the first buffer in a record and to take this as the ntfs record
+          dirty state.  We cannot look at the dirty state for subsequent
+          buffers because we might be racing with
+          fs/ntfs/aops.c::mark_ntfs_record_dirty().
+        - Move the static inline ntfs_init_big_inode() from fs/ntfs/inode.c to
+          inode.h and make fs/ntfs/inode.c::__ntfs_init_inode() non-static and
+          add a declaration for it to inode.h.  Fix some compilation issues
+          that resulted due to #includes and header file interdependencies.
+        - Simplify setup of i_mode in fs/ntfs/inode.c::ntfs_read_locked_inode().
+        - Add helpers fs/ntfs/layout.h::MK_MREF() and MK_LE_MREF().
+        - Modify fs/ntfs/mft.c::map_extent_mft_record() to only verify the mft
+          record sequence number if it is specified (i.e. not zero).
+        - Add fs/ntfs/mft.[hc]::ntfs_mft_record_alloc() and various helper
+          functions used by it.
+        - Update Documentation/filesystems/ntfs.txt with instructions on how to
+          use the Device-Mapper driver with NTFS ftdisk/LDM raid.  This removes
+          the linear raid problem with the Software RAID / MD driver when one
+          or more of the devices has an odd number of sectors.
+2.1.20 - Fix two stupid bugs introduced in 2.1.18 release.
+        - Fix stupid bug in fs/ntfs/attrib.c::ntfs_attr_reinit_search_ctx()
+          where we did not clear ctx->al_entry but it was still set due to
+          changes in ntfs_attr_lookup() and ntfs_external_attr_find() in
+          particular.
+        - Fix another stupid bug in fs/ntfs/attrib.c::ntfs_external_attr_find()
+          where we forgot to unmap the extent mft record when we had finished
+          enumerating an attribute which caused a bug check to trigger when the
+          VFS calls ->clear_inode.
+2.1.19 - Many cleanups, improvements, and a minor bug fix.
+        - Update ->setattr (fs/ntfs/inode.c::ntfs_setattr()) to refuse to
+          change the uid, gid, and mode of an inode as we do not support NTFS
+          ACLs yet.
+        - Remove BKL use from ntfs_setattr() syncing up with the rest of the
+          kernel.
+        - Get rid of the ugly transparent union in fs/ntfs/dir.c::ntfs_readdir()
+          and ntfs_filldir() as per suggestion from Al Viro.
+        - Change '\0' and L'\0' to simply 0 as per advice from Linus Torvalds.
+        - Update ->truncate (fs/ntfs/inode.c::ntfs_truncate()) to check if the
+          inode size has changed and to only output an error if so.
+        - Rename fs/ntfs/attrib.h::attribute_value_length() to ntfs_attr_size().
+        - Add le{16,32,64} as well as sle{16,32,64} data types to
+          fs/ntfs/types.h.
+        - Change ntfschar to be le16 instead of u16 in fs/ntfs/types.h.
+        - Add le versions of VCN, LCN, and LSN called leVCN, leLCN, and leLSN,
+          respectively, to fs/ntfs/types.h.
+        - Update endianness conversion macros in fs/ntfs/endian.h to use the
+          new types as appropriate.
+        - Do proper type casting when using sle64_to_cpup() in fs/ntfs/dir.c
+          and index.c.
+        - Add leMFT_REF data type to fs/ntfs/layout.h.
+        - Update all NTFS header files with the new little endian data types.
+          Affected files are fs/ntfs/layout.h, logfile.h, and time.h.
+        - Do proper type casting when using ntfs_is_*_recordp() in
+          fs/ntfs/logfile.c, mft.c, and super.c.
+        - Fix all the sparse bitwise warnings.  Had to change all the typedef
+          enums storing little endian values to simple enums plus a typedef for
+          the datatype to make sparse happy.
+        - Fix a bug found by the new sparse bitwise warnings where the default
+          upcase table was defined as a pointer to wchar_t rather than ntfschar
+          in fs/ntfs/ntfs.h and super.c.
+        - Change {const_,}cpu_to_le{16,32}(0) to just 0 as suggested by Al Viro.
+2.1.18 - Fix scheduling latencies at mount time as well as an endianness bug.
+        - Remove vol->nr_mft_records as it was pretty meaningless and optimize
+          the calculation of total/free inodes as used by statfs().
+        - Fix scheduling latencies in ntfs_fill_super() by dropping the BKL
+          because the code itself is using the ntfs_lock semaphore which
+          provides safe locking.  (Ingo Molnar)
+        - Fix a potential bug in fs/ntfs/mft.c::map_extent_mft_record() that
+          could occur in the future for when we start closing/freeing extent
+          inodes if we don't set base_ni->ext.extent_ntfs_inos to NULL after
+          we free it.
+        - Rename {find,lookup}_attr() to ntfs_attr_{find,lookup}() as well as
+          find_external_attr() to ntfs_external_attr_find() to cleanup the
+          namespace a bit and to be more consistent with libntfs.
+        - Rename {{re,}init,get,put}_attr_search_ctx() to
+          ntfs_attr_{{re,}init,get,put}_search_ctx() as well as the type
+          attr_search_context to ntfs_attr_search_ctx.
+        - Force use of ntfs_attr_find() in ntfs_attr_lookup() when searching
+          for the attribute list attribute itself.
+        - Fix endianness bug in ntfs_external_attr_find().
+        - Change ntfs_{external_,}attr_find() to return 0 on success, -ENOENT
+          if the attribute is not found, and -EIO on real error.  In the case
+          of -ENOENT, the search context is updated to describe the attribute
+          before which the attribute being searched for would need to be
+          inserted if such an action were to be desired and in the case of
+          ntfs_external_attr_find() the search context is also updated to
+          indicate the attribute list entry before which the attribute list
+          entry of the attribute being searched for would need to be inserted
+          if such an action were to be desired.  Also make ntfs_find_attr()
+          static and remove its prototype from attrib.h as it is not used
+          anywhere other than attrib.c.  Update ntfs_attr_lookup() and all
+          callers of ntfs_{external,}attr_{find,lookup}() for the new return
+          values.
+        - Minor cleanup of fs/ntfs/inode.c::ntfs_init_locked_inode().
+2.1.17 - Fix bugs in mount time error code paths and other updates.
+        - Implement bitmap modification code (fs/ntfs/bitmap.[hc]).  This
+          includes functions to set/clear a single bit or a run of bits.
+        - Add fs/ntfs/attrib.[hc]::ntfs_find_vcn() which returns the locked
+          runlist element containing a particular vcn.  It also takes care of
+          mapping any needed runlist fragments.
+        - Implement cluster (de-)allocation code (fs/ntfs/lcnalloc.[hc]).
+        - Load attribute definition table from $AttrDef at mount time.
+        - Fix bugs in mount time error code paths involving (de)allocation of
+          the default and volume upcase tables.
+        - Remove ntfs_nr_mounts as it is no longer used.
+2.1.16 - Implement access time updates, file sync, async io, and read/writev.
+        - Add support for readv/writev and aio_read/aio_write (fs/ntfs/file.c).
+          This is done by setting the appropriate file operations pointers to
+          the generic helper functions provided by mm/filemap.c.
+        - Implement fsync, fdatasync, and msync both for files (fs/ntfs/file.c)
+          and directories (fs/ntfs/dir.c).
+        - Add support for {a,m,c}time updates to inode.c::ntfs_write_inode().
+          Note, except for the root directory and any other system files opened
+          by the user, the system files will not have their access times
+          updated as they are only accessed at the inode level and hence the
+          file level functions which cause the times to be updated are never
+          invoked.
+2.1.15 - Invalidate quotas when (re)mounting read-write.
+        - Add new element itype.index.collation_rule to the ntfs inode
+          structure and set it appropriately in ntfs_read_locked_inode().
+        - Implement a new inode type "index" to allow efficient access to the
+          indices found in various system files and adapt inode handling
+          accordingly (fs/ntfs/inode.[hc]).  An index inode is essentially an
+          attribute inode (NInoAttr() is true) with an attribute type of
+          AT_INDEX_ALLOCATION.  As such, it is no longer allowed to call
+          ntfs_attr_iget() with an attribute type of AT_INDEX_ALLOCATION as
+          there would be no way to distinguish between normal attribute inodes
+          and index inodes.  The function to obtain an index inode is
+          ntfs_index_iget() and it uses the helper function
+          ntfs_read_locked_index_inode().  Note, we do not overload
+          ntfs_attr_iget() as indices consist of multiple attributes so using
+          ntfs_attr_iget() to obtain an index inode would be confusing.
+        - Ensure that there is no overflow when doing page->index <<
+          PAGE_CACHE_SHIFT by casting page->index to s64 in fs/ntfs/aops.c.
+        - Use atomic kmap instead of kmap() in fs/ntfs/aops.c::ntfs_read_page()
+          and ntfs_read_block().
+        - Use case sensitive attribute lookups instead of case insensitive ones.
+        - Lock all page cache pages belonging to mst protected attributes while
+          accessing them to ensure we never see corrupt data while the page is
+          under writeout.
+        - Add framework for generic ntfs collation (fs/ntfs/collation.[hc]).
+          We have ntfs_is_collation_rule_supported() to check if the collation
+          rule you want to use is supported and ntfs_collation() which actually
+          collates two data items.  We currently only support COLLATION_BINARY
+          and COLLATION_NTOFS_ULONG but support for other collation rules will
+          be added as the need arises.
+        - Add a new type, ntfs_index_context, to allow retrieval of an index
+          entry using the corresponding index key.  To get an index context,
+          use ntfs_index_ctx_get() and to release it, use ntfs_index_ctx_put().
+          This also adds a new slab cache for the index contexts.  To lookup a
+          key in an index inode, use ntfs_index_lookup().  After modifying an
+          index entry, call ntfs_index_entry_flush_dcache_page() followed by
+          ntfs_index_entry_mark_dirty() to ensure the changes are written out
+          to disk.  For details see fs/ntfs/index.[hc].  Note, at present, if
+          an index entry is in the index allocation attribute rather than the
+          index root attribute it will not be written out (you will get a
+          warning message about discarded changes instead).
+        - Load the quota file ($Quota) and check if quota tracking is enabled
+          and if so, mark the quotas out of date.  This causes windows to
+          rescan the volume on boot and update all quota entries.
+        - Add a set_page_dirty address space operation for ntfs_m[fs]t_aops.
+          It is simply set to __set_page_dirty_nobuffers() to make sure that
+          running set_page_dirty() on a page containing mft/ntfs records will
+          not affect the dirty state of the page buffers.
+        - Add fs/ntfs/index.c::__ntfs_index_entry_mark_dirty() which sets all
+          buffers that are inside the ntfs record in the page dirty after which
+          it sets the page dirty.  This allows ->writepage to only write the
+          dirty index records rather than having to write all the records in
+          the page.  Modify fs/ntfs/index.h::ntfs_index_entry_mark_dirty() to
+          use this rather than __set_page_dirty_nobuffers().
+        - Implement fs/ntfs/aops.c::ntfs_write_mst_block() which enables the
+          writing of page cache pages belonging to mst protected attributes
+          like the index allocation attribute in directory indices and other
+          indices like $Quota/$Q, etc.  This means that the quota is now marked
+          out of date on all volumes rather than only on ones where the quota
+          defaults entry is in the index root attribute of the $Quota/$Q index.
+2.1.14 - Fix an NFSd caused deadlock reported by several users.
+        - Modify fs/ntfs/dir.c::ntfs_readdir() to copy the index root attribute
+          value to a buffer so that we can put the search context and unmap the
+          mft record before calling the filldir() callback.  We need to do this
+          because of NFSd which calls ->lookup() from its filldir() callback
+          and this causes NTFS to deadlock as ntfs_lookup() maps the mft record
+          of the directory and since ntfs_readdir() has got it mapped already
+          ntfs_lookup() deadlocks.
+2.1.13 - Enable overwriting of resident files and housekeeping of system files.
+        - Implement writing of mft records (fs/ntfs/mft.[hc]), which includes
+          keeping the mft mirror in sync with the mft when mirrored mft records
+          are written.  The functions are write_mft_record{,_nolock}().  The
+          implementation is quite rudimentary for now with lots of things not
+          implemented yet but I am not sure any of them can actually occur so
+          I will wait for people to hit each one and only then implement it.
+        - Commit open system inodes at umount time.  This should make it
+          virtually impossible for sync_mft_mirror_umount() to ever be needed.
+        - Implement ->write_inode (fs/ntfs/inode.c::ntfs_write_inode()) for the
+          ntfs super operations.  This gives us inode writing via the VFS inode
+          dirty code paths.  Note:  Access time updates are not implemented yet.
+        - Implement fs/ntfs/mft.[hc]::{,__}mark_mft_record_dirty() and make
+          fs/ntfs/aops.c::ntfs_writepage() and ntfs_commit_write() use it, thus
+          finally enabling resident file overwrite!  (-8  This also includes a
+          placeholder for ->writepage (ntfs_mft_writepage()), which for now
+          just redirties the page and returns.  Also, at umount time, we for
+          now throw away all mft data page cache pages after the last call to
+          ntfs_commit_inode() in the hope that all inodes will have been
+          written out by then and hence no dirty (meta)data will be lost.  We
+          also check for this case and emit an error message telling the user
+          to run chkdsk.
+        - Use set_page_writeback() and end_page_writeback() in the resident
+          attribute code path of fs/ntfs/aops.c::ntfs_writepage() otherwise
+          the radix-tree tag PAGECACHE_TAG_DIRTY remains set even though the
+          page is clean.
+        - Implement ntfs_mft_writepage() so it now checks if any of the mft
+          records in the page are dirty and if so redirties the page and
+          returns.  Otherwise it just returns (after doing set_page_writeback(),
+          unlock_page(), end_page_writeback() or the radix-tree tag
+          PAGECACHE_TAG_DIRTY remains set even though the page is clean), thus
+          allowing the VM to do with the page as it pleases.  Also, at umount
+          time, now only throw away dirty mft (meta)data pages if dirty inodes
+          are present and ask the user to email us if they see this happening.
+        - Add functions ntfs_{clear,set}_volume_flags(), to modify the volume
+          information flags (fs/ntfs/super.c).
+        - Mark the volume dirty when (re)mounting read-write and mark it clean
+          when unmounting or remounting read-only.  If any volume errors are
+          found, the volume is left marked dirty to force chkdsk to run.
+        - Add code to set the NT4 compatibility flag when (re)mounting
+          read-write for newer NTFS versions but leave it commented out for now
+          since we do not make any modifications that are NTFS 1.2 specific yet
+          and since setting this flag breaks Captive-NTFS which is not nice.
+          This code must be enabled once we start writing NTFS 1.2 specific
+          changes otherwise Windows NTFS driver might crash / cause corruption.
+2.1.12 - Fix the second fix to the decompression engine and some cleanups.
+        - Add a new address space operations struct, ntfs_mst_aops, for mst
+          protected attributes.  This is because the default ntfs_aops do not
+          make sense with mst protected data and were they to write anything to
+          such an attribute they would cause data corruption so we provide
+          ntfs_mst_aops which does not have any write related operations set.
+        - Cleanup dirty ntfs inode handling (fs/ntfs/inode.[hc]) which also
+          includes an adapted ntfs_commit_inode() and an implementation of
+          ntfs_write_inode() which for now just cleans dirty inodes without
+          writing them (it does emit a warning that this is happening).
+        - Undo the second decompression engine fix (see 2.1.9 release ChangeLog
+          entry) as it was only fixing a theoretical bug but at the same time
+          it badly broke the handling of sparse and uncompressed compression
+          blocks.
+2.1.11 - Driver internal cleanups.
+        - Only build logfile.o if building the driver with read-write support.
+        - Really final white space cleanups.
+        - Use generic_ffs() instead of ffs() in logfile.c which allows the
+          log_page_size variable to be optimized by gcc into a constant.
+        - Rename uchar_t to ntfschar everywhere as uchar_t is unsigned 1-byte
+          char as defined by POSIX and as found on some systems.
+2.1.10 - Force read-only (re)mounting of volumes with unsupported volume flags.
+        - Finish off the white space cleanups (remove trailing spaces, etc).
+        - Clean up ntfs_fill_super() and ntfs_read_inode_mount() by removing
+          the kludges around the first iget().  Instead of (re)setting ->s_op
+          we have the $MFT inode set up by explicit new_inode() / set ->i_ino /
+          insert_inode_hash() / call ntfs_read_inode_mount() directly.  This
+          kills the need for second super_operations and allows to return error
+          from ntfs_read_inode_mount() without resorting to ugly "poisoning"
+          tricks.  (Al Viro)
+        - Force read-only (re)mounting if any of the following bits are set in
+          the volume information flags:
+                VOLUME_IS_DIRTY, VOLUME_RESIZE_LOG_FILE,
+                VOLUME_UPGRADE_ON_MOUNT, VOLUME_DELETE_USN_UNDERWAY,
+                VOLUME_REPAIR_OBJECT_ID, VOLUME_MODIFIED_BY_CHKDSK
+          To make this easier we define VOLUME_MUST_MOUNT_RO_MASK with all the
+          above bits set so the test is made easy.
+2.1.9 - Fix two bugs in decompression engine.
+        - Fix a bug where we would not always detect that we have reached the
+          end of a compression block because we were ending at minus one byte
+          which is effectively the same as being at the end.  The fix is to
+          check whether the uncompressed buffer has been fully filled and if so
+          we assume we have reached the end of the compression block.  A big
+          thank you to Marcin Gibuła for the bug report, the assistance in
+          tracking down the bug and testing the fix.
+        - Fix a possible bug where when a compressed read is truncated to the
+          end of the file, the offset inside the last page was not truncated.
+2.1.8 - Handle $MFT mirror and $LogFile, improve time handling, and cleanups.
+        - Use get_bh() instead of manual atomic_inc() in fs/ntfs/compress.c.
+        - Modify fs/ntfs/time.c::ntfs2utc(), get_current_ntfs_time(), and
+          utc2ntfs() to work with struct timespec instead of time_t on the
+          Linux UTC time side thus preserving the full precision of the NTFS
+          time and only losing up to 99 nano-seconds in the Linux UTC time.
+        - Move fs/ntfs/time.c to fs/ntfs/time.h and make the time functions
+          static inline.
+        - Remove unused ntfs_dirty_inode().
+        - Cleanup super operations declaration in fs/ntfs/super.c.
+        - Wrap flush_dcache_mft_record_page() in #ifdef NTFS_RW.
+        - Add NInoTestSetFoo() and NInoTestClearFoo() macro magic to
+          fs/ntfs/inode.h and use it to declare NInoTest{Set,Clear}Dirty.
+        - Move typedefs for ntfs_attr and test_t from fs/ntfs/inode.c to
+          fs/ntfs/inode.h so they can be used elsewhere.
+        - Determine the mft mirror size as the number of mirrored mft records
+          and store it in ntfs_volume->mftmirr_size (fs/ntfs/super.c).
+        - Load the mft mirror at mount time and compare the mft records stored
+          in it to the ones in the mft.  Force a read-only mount if the two do
+          not match (fs/ntfs/super.c).
+        - Fix type casting related warnings on 64-bit architectures.  Thanks
+          to Meelis Roos for reporting them.
+        - Move %L to %ll as %L is floating point and %ll is integer which is
+          what we want.
+        - Read the journal ($LogFile) and determine if the volume has been
+          shutdown cleanly and force a read-only mount if not (fs/ntfs/super.c
+          and fs/ntfs/logfile.c).  This is a little bit of a crude check in
+          that we only look at the restart areas and not at the actual log
+          records so that there will be a very small number of cases where we
+          think that a volume is dirty when in fact it is clean.  This should
+          only affect volumes that have not been shutdown cleanly and did not
+          have any pending, non-check-pointed i/o.
+        - If the $LogFile indicates a clean shutdown and a read-write (re)mount
+          is requested, empty $LogFile by overwriting it with 0xff bytes to
+          ensure that Windows cannot cause data corruption by replaying a stale
+          journal after Linux has written to the volume.
+2.1.7 - Enable NFS exporting of mounted NTFS volumes.
+        - Set i_generation in the VFS inode from the seq_no of the NTFS inode.
+        - Make ntfs_lookup() NFS export safe, i.e. use d_splice_alias(), etc.
+        - Implement ->get_dentry() in fs/ntfs/namei.c::ntfs_get_dentry() as the
+          default doesn't allow inode number 0 which is a valid inode on NTFS
+          and even if it did allow that it uses iget() instead of ntfs_iget()
+          which makes it useless for us.
+        - Implement ->get_parent() in fs/ntfs/namei.c::ntfs_get_parent() as the
+          default just returns -EACCES which is not very useful.
+        - Define export operations (->s_export_op) for NTFS (ntfs_export_ops)
+          and set them up in the super block at mount time (super.c).  This
+          allows mounted NTFS volumes to be exported via NFS.
+        - Add missing return -EOPNOTSUPP; in
+          fs/ntfs/aops.c::ntfs_commit_nonresident_write().
+        - Enforce no atime and no dir atime updates at mount/remount time as
+          they are not implemented yet anyway.
+        - Move a few assignments in fs/ntfs/attrib.c::load_attribute_list() to
+          after a NULL check.  Thanks to Dave Jones for pointing this out.
+2.1.6 - Fix minor bug in handling of compressed directories.
+        - Fix bug in handling of compressed directories.  A compressed
+          directory is not really compressed so when we set the ->i_blocks
+          field of a compressed directory inode we were setting it from the
+          non-existing field ni->itype.compressed.size which gave random
+          results...  For directories we now always use ni->allocated_size.
+2.1.5 - Fix minor bug in attribute list attribute handling.
+        - Fix bug in attribute list handling.  Actually it is not as much a bug
+          as too much protection in that we were not allowing attribute lists
+          which waste space on disk while Windows XP clearly allows it and in
+          fact creates such attribute lists so our driver was failing.
+        - Update NTFS documentation ready for 2.6 kernel release.
+2.1.4 - Reduce compiler requirements.
+        - Remove all uses of unnamed structs and unions in the driver to make
+          old and newer gcc versions happy. Makes it a bit uglier IMO but at
+          least people will stop hassling me about it.
+2.1.3 - Important bug fixes in corner cases.
+        - super.c::parse_ntfs_boot_sector(): Correct the check for 64-bit
+          clusters. (Philipp Thomas)
+        - attrib.c::load_attribute_list(): Fix bug when initialized_size is a
+          multiple of the block_size but not the cluster size. (Szabolcs
+          Szakacsits <szaka@sienet.hu>)
+2.1.2 - Important bug fixes alleviating the hangs in statfs.
+        - Fix buggy free cluster and free inode determination logic.
+2.1.1 - Minor updates.
+        - Add handling for initialized_size != data_size in compressed files.
+        - Reduce function local stack usage from 0x3d4 bytes to just noise in
+          fs/ntfs/upcase.c. (Randy Dunlap <rddunlap@osdl.org>)
+        - Remove compiler warnings for newer gcc.
+        - Pages are no longer kmapped by mm/filemap.c::generic_file_write()
+          around calls to ->{prepare,commit}_write.  Adapt NTFS appropriately
+          in fs/ntfs/aops.c::ntfs_prepare_nonresident_write() by using
+          kmap_atomic(KM_USER0).
+2.1.0 - First steps towards write support: implement file overwrite.
+        - Add configuration option for developmental write support with an
+          appropriately scary configuration help text.
+        - Initial implementation of fs/ntfs/aops.c::ntfs_writepage() and its
+          helper fs/ntfs/aops.c::ntfs_write_block(). This enables mmap(2) based
+          overwriting of existing files on ntfs. Note: Resident files are
+          only written into memory, and not written out to disk at present, so
+          avoid writing to files smaller than about 1kiB.
+        - Initial implementation of fs/ntfs/aops.c::ntfs_prepare_write(), its
+          helper fs/ntfs/aops.c::ntfs_prepare_nonresident_write() and their
+          counterparts, fs/ntfs/aops.c::ntfs_commit_write(), and
+          fs/ntfs/aops.c::ntfs_commit_nonresident_write(), respectively. Also,
+          add generic_file_write() to the ntfs file operations (fs/ntfs/file.c).
+          This enables write(2) based overwriting of existing files on ntfs.
+          Note: As with mmap(2) based overwriting, resident files are only
+          written into memory, and not written out to disk at present, so avoid
+          writing to files smaller than about 1kiB.
+        - Implement ->truncate (fs/ntfs/inode.c::ntfs_truncate()) and
+          ->setattr() (fs/ntfs/inode.c::ntfs_setattr()) inode operations for
+          files with the purpose of intercepting and aborting all i_size
+          changes which we do not support yet. ntfs_truncate() actually only
+          emits a warning message but AFAICS our interception of i_size changes
+          elsewhere means ntfs_truncate() never gets called for i_size changes.
+          It is only called from generic_file_write() when we fail in
+          ntfs_prepare_{,nonresident_}write() in order to discard any
+          instantiated buffers beyond i_size. Thus i_size is not actually
+          changed so our warning message is enough. Unfortunately it is not
+          possible to easily determine if i_size is being changed or not hence
+          we just emit an appropriately worded error message.
+2.0.25 - Small bug fixes and cleanups.
+        - Unlock the page in an out of memory error code path in
+          fs/ntfs/aops.c::ntfs_read_block().
+        - If fs/ntfs/aops.c::ntfs_read_page() is called on an uptodate page,
+          just unlock the page and return. (This can happen due to ->writepage
+          clearing PageUptodate() during write out of MstProtected()
+          attributes.)
+        - Remove leaked write code again.
+2.0.24 - Cleanups.
+        - Treat BUG_ON() as ASSERT() not VERIFY(), i.e. do not use side effects
+          inside BUG_ON(). (Adam J. Richter)
+        - Split logical OR expressions inside BUG_ON() into individual BUG_ON()
+          calls for improved debugging. (Adam J. Richter)
+        - Add errors flag to the ntfs volume state, accessed via
+          NVol{,Set,Clear}Errors(vol).
+        - Do not allow read-write remounts of read-only volumes with errors.
+        - Clarify comment for ntfs file operation sendfile which was added by
+          Christoph Hellwig a while ago (just using generic_file_sendfile())
+          to say that ntfs ->sendfile is only used for the case where the
+          source data is on the ntfs partition and the destination is
+          somewhere else, i.e. nothing we need to concern ourselves with.
+        - Add generic_file_write() as our ntfs file write operation.
+2.0.23 - Major bug fixes (races, deadlocks, non-i386 architectures).
+        - Massive internal locking changes to mft record locking. Fixes lock
+          recursion and replaces the mrec_lock read/write semaphore with a
+          mutex. Also removes the now superfluous mft_count. This fixes several
+          race conditions and deadlocks, especially in the future write code.
+        - Fix ntfs over loopback for compressed files by adding an
+          optimization barrier. (gcc was screwing up otherwise ?)
+        - Miscellaneous cleanups all over the code and a fix or two in error
+          handling code paths.
+        Thanks go to Christoph Hellwig for pointing out the following two:
+        - Remove now unused function fs/ntfs/malloc.h::vmalloc_nofs().
+        - Fix ntfs_free() for ia64 and parisc by checking for VMALLOC_END, too.
+2.0.22 - Cleanups, mainly to ntfs_readdir(), and use C99 initializers.
+        - Change fs/ntfs/dir.c::ntfs_readdir() to only read/write ->f_pos once
+          at entry/exit respectively.
+        - Use C99 initializers for structures.
+        - Remove unused variable blocks from fs/ntfs/aops.c::ntfs_read_block().
+2.0.21 - Check for, and refuse to work with too large files/directories/volumes.
+        - Limit volume size at mount time to 2TiB on architectures where
+          unsigned long is 32-bits (fs/ntfs/super.c::parse_ntfs_boot_sector()).
+          This is the most we can do without overflowing the 32-bit limit of
+          the block device size imposed on us by sb_bread() and sb_getblk()
+          for the time being.
+        - Limit file/directory size at open() time to 16TiB on architectures
+          where unsigned long is 32-bits (fs/ntfs/file.c::ntfs_file_open() and
+          fs/ntfs/dir.c::ntfs_dir_open()). This is the most we can do without
+          overflowing the page cache page index.
+2.0.20 - Support non-resident directory index bitmaps, fix page leak in readdir.
+        - Move the directory index bitmap to use an attribute inode instead of
+          having special fields for it inside the ntfs inode structure. This
+          means that the index bitmaps now use the page cache for i/o, too,
+          and also as a side effect we get support for non-resident index
+          bitmaps for free.
+        - Simplify/cleanup error handling in fs/ntfs/dir.c::ntfs_readdir() and
+          fix a page leak that manifested itself in some cases.
+        - Add fs/ntfs/inode.c::ntfs_put_inode(), which we need to release the
+          index bitmap inode on the final iput().
+2.0.19 - Fix race condition, improvements, and optimizations in i/o interface.
+        - Apply block optimization added to fs/ntfs/aops.c::ntfs_read_block()
+          to fs/ntfs/compress.c::ntfs_file_read_compressed_block() as well.
+        - Drop the "file" from ntfs_file_read_compressed_block().
+        - Rename fs/ntfs/aops.c::ntfs_end_buffer_read_async() to
+          ntfs_end_buffer_async_read() (more like the fs/buffer.c counterpart).
+        - Update ntfs_end_buffer_async_read() with the improved logic from
+          its updated counterpart fs/buffer.c::end_buffer_async_read(). Apply
+          further logic improvements to better determine when we set PageError.
+        - Update submission of buffers in fs/ntfs/aops.c::ntfs_read_block() to
+          check for the buffers being uptodate first in line with the updated
+          fs/buffer.c::block_read_full_page(). This plugs a small race
+          condition.
+2.0.18 - Fix race condition in reading of compressed files.
+        - There was a narrow window between checking a buffer head for being
+          uptodate and locking it in ntfs_file_read_compressed_block(). We now
+          lock the buffer and then check whether it is uptodate or not.
+2.0.17 - Cleanups and optimizations - shrinking the ToDo list.
+        - Modify fs/ntfs/inode.c::ntfs_read_locked_inode() to return an error
+          code and update callers, i.e. ntfs_iget(), to pass that error code
+          up instead of just using -EIO.
+        - Modifications to super.c to ensure that both mount and remount
+          cannot set any write related options when the driver is compiled
+          read-only.
+        - Optimize block resolution in fs/ntfs/aops.c::ntfs_read_block() to
+          cache the current runlist element. This should improve performance
+          when reading very large and/or very fragmented data.
+2.0.16 - Convert access to $MFT/$BITMAP to attribute inode API.
+        - Fix a stupid bug introduced in 2.0.15 where we were unmapping the
+          wrong inode in fs/ntfs/inode.c::ntfs_attr_iget().
+        - Fix debugging check in fs/ntfs/aops.c::ntfs_read_block().
+        - Convert $MFT/$BITMAP access to attribute inode API and remove all
+          remnants of the ugly mftbmp address space and operations hack. This
+          means we finally have only one readpage function as well as only one
+          async io completion handler. Yey! The mft bitmap is now just an
+          attribute inode and is accessed from vol->mftbmp_ino just as if it
+          were a normal file. Fake inodes rule. (-:
+2.0.15 - Fake inodes based attribute i/o via the pagecache, fixes and cleanups.
+        - Fix silly bug in fs/ntfs/super.c::parse_options() which was causing
+          remounts to fail when the partition had an entry in /etc/fstab and
+          the entry specified the nls= option.
+        - Apply same macro magic used in fs/ntfs/inode.h to fs/ntfs/volume.h to
+          expand all the helper functions NVolFoo(), NVolSetFoo(), and
+          NVolClearFoo().
+        - Move copyright statement from driver initialisation message to
+          module description (fs/ntfs/super.c). This makes the initialisation
+          message fit on one line and fits in better with rest of kernel.
+        - Update fs/ntfs/attrib.c::map_run_list() to work on both real and
+          attribute inodes, and both for files and directories.
+        - Implement fake attribute inodes allowing all attribute i/o to go via
+          the page cache and to use all the normal vfs/mm functionality:
+          - Add ntfs_attr_iget() and its helper ntfs_read_locked_attr_inode()
+            to fs/ntfs/inode.c.
+          - Add needed cleanup code to ntfs_clear_big_inode().
+        - Merge address space operations for files and directories (aops.c),
+          now just have ntfs_aops:
+          - Rename:
+                end_buffer_read_attr_async() -> ntfs_end_buffer_read_async(),
+                ntfs_attr_read_block()       -> ntfs_read_block(),
+                ntfs_file_read_page()        -> ntfs_readpage().
+          - Rewrite fs/ntfs/aops.c::ntfs_readpage() to work on both real and
+            attribute inodes, and both for files and directories.
+          - Remove obsolete fs/ntfs/aops.c::ntfs_mst_readpage().
+2.0.14 - Run list merging code cleanup, minor locking changes, typo fixes.
+        - Change fs/ntfs/super.c::ntfs_statfs() to not rely on BKL by moving
+          the locking out of super.c::get_nr_free_mft_records() and taking and
+          dropping the mftbmp_lock rw_semaphore in ntfs_statfs() itself.
+        - Bring attribute runlist merging code (fs/ntfs/attrib.c) in sync with
+          current userspace ntfs library code. This means that if a merge
+          fails the original runlists are always left unmodified instead of
+          being silently corrupted.
+        - Misc typo fixes.
+2.0.13 - Use iget5_locked() in preparation for fake inodes and small cleanups.
+        - Remove nr_mft_bits and the now superfluous union with nr_mft_records
+          from ntfs_volume structure.
+        - Remove nr_lcn_bits and the now superfluous union with nr_clusters
+          from ntfs_volume structure.
+        - Use iget5_locked() and friends instead of conventional iget(). Wrap
+          the call in fs/ntfs/inode.c::ntfs_iget() and update callers of iget()
+          to use ntfs_iget(). Leave only one iget() call at mount time so we
+          don't need an ntfs_iget_mount().
+        - Change fs/ntfs/inode.c::ntfs_new_extent_inode() to take mft_no as an
+          additional argument.
+2.0.12 - Initial cleanup of address space operations following 2.0.11 changes.
+        - Merge fs/ntfs/aops.c::end_buffer_read_mst_async() and
+          fs/ntfs/aops.c::end_buffer_read_file_async() into one function
+          fs/ntfs/aops.c::end_buffer_read_attr_async() using NInoMstProtected()
+          to determine whether to apply mst fixups or not.
+        - Above change allows merging fs/ntfs/aops.c::ntfs_file_read_block()
+          and fs/ntfs/aops.c::ntfs_mst_readpage() into one function
+          fs/ntfs/aops.c::ntfs_attr_read_block(). Also, create a tiny wrapper
+          fs/ntfs/aops.c::ntfs_mst_readpage() to transform the parameters from
+          the VFS readpage function prototype to the ntfs_attr_read_block()
+          function prototype.
+2.0.11 - Initial preparations for fake inode based attribute i/o.
+        - Move definition of ntfs_inode_state_bits to fs/ntfs/inode.h and
+          do some macro magic (adapted from include/linux/buffer_head.h) to
+          expand all the helper functions NInoFoo(), NInoSetFoo(), and
+          NInoClearFoo().
+        - Add new flag to ntfs_inode_state_bits: NI_Sparse.
+        - Add new fields to ntfs_inode structure to allow use of fake inodes
+          for attribute i/o: type, name, name_len. Also add new state bits:
+          NI_Attr, which, if set, indicates the inode is a fake inode, and
+          NI_MstProtected, which, if set, indicates the attribute uses multi
+          sector transfer protection, i.e. fixups need to be applied after
+          reads and before/after writes.
+        - Rename fs/ntfs/inode.c::ntfs_{new,clear,destroy}_inode() to
+          ntfs_{new,clear,destroy}_extent_inode() and update callers.
+        - Use ntfs_clear_extent_inode() in fs/ntfs/inode.c::__ntfs_clear_inode()
+          instead of ntfs_destroy_extent_inode().
+        - Cleanup memory deallocations in {__,}ntfs_clear_{,big_}inode().
+        - Make all operations on ntfs inode state bits use the NIno* functions.
+        - Set up the new ntfs inode fields and state bits in
+          fs/ntfs/inode.c::ntfs_read_inode() and add appropriate cleanup of
+          allocated memory to __ntfs_clear_inode().
+        - Cleanup ntfs_inode structure a bit for better ordering of elements
+          w.r.t. their size to allow better packing of the structure in memory.
+2.0.10 - There can only be 2^32 - 1 inodes on an NTFS volume.
+        - Add check at mount time to verify that the number of inodes on the
+          volume does not exceed 2^32 - 1, which is the maximum allowed for
+          NTFS according to Microsoft.
+        - Change mft_no member of ntfs_inode structure to be unsigned long.
+          Update all users. This makes ntfs_inode->mft_no just a copy of struct
+          inode->i_ino. But we can't just always use struct inode->i_ino and
+          remove mft_no because extent inodes do not have an attached struct
+          inode.
+2.0.9 - Decompression engine now uses a single buffer and other cleanups.
+        - Change decompression engine to use a single buffer protected by a
+          spin lock instead of per-CPU buffers. (Rusty Russell)
+        - Do not update cb_pos when handling a partial final page during
+          decompression of a sparse compression block, as the value is later
+          reset without being read/used. (Rusty Russell)
+        - Switch to using the new KM_BIO_SRC_IRQ for atomic kmap()s. (Andrew
+          Morton)
+        - Change buffer size in ntfs_readdir()/ntfs_filldir() to use
+          NLS_MAX_CHARSET_SIZE which makes the buffers almost 1kiB each but
+          it also makes everything safer so it is a good thing.
+        - Miscellaneous minor cleanups to comments.
+2.0.8 - Major updates for handling of case sensitivity and dcache aliasing.
+        Big thanks go to Al Viro and other inhabitants of #kernel for investing
+        their time to discuss the case sensitivity and dcache aliasing issues.
+        - Remove unused source file fs/ntfs/attraops.c.
+        - Remove show_inodes mount option(s), thus dropping support for
+          displaying of short file names.
+        - Remove deprecated mount option posix.
+        - Restore show_sys_files mount option.
+        - Add new mount option case_sensitive, to determine if the driver
+          treats file names as case sensitive or not. If case sensitive, create
+          file names in the POSIX namespace. Otherwise create file names in the
+          LONG/WIN32 namespace. Note, files remain accessible via their short
+          file name, if it exists.
+        - Remove really dumb logic bug in boot sector recovery code.
+        - Fix dcache aliasing issues wrt short/long file names via changes
+          to fs/ntfs/dir.c::ntfs_lookup_inode_by_name() and
+          fs/ntfs/namei.c::ntfs_lookup():
+          - Add additional argument to ntfs_lookup_inode_by_name() in which we
+            return information about the matching file name if the case is not
+            matching or the match is a short file name. See comments above the
+            function definition for details.
+          - Change ntfs_lookup() to only create dcache entries for the correctly
+            cased file name and only for the WIN32 namespace counterpart of DOS
+            namespace file names. This ensures we have only one dentry per
+            directory and also removes all dcache aliasing issues between short
+            and long file names once we add write support. See comments above
+            function for details.
+        - Fix potential 1 byte overflow in fs/ntfs/unistr.c::ntfs_ucstonls().
+2.0.7 - Minor cleanups and updates for changes in core kernel code.
+        - Remove much of the NULL struct element initializers.
+        - Various updates to make compatible with recent kernels.
+        - Remove defines of MAX_BUF_PER_PAGE and include linux/buffer_head.h
+          in fs/ntfs/ntfs.h instead.
+        - Remove no longer needed KERNEL_VERSION checks. We are now in the
+          kernel proper so they are no longer needed.
+2.0.6 - Major bugfix to make compatible with other kernel changes.
+        - Initialize the mftbmp address space properly now that there are more
+          fields in the struct address_space. This was leading to hangs and
+          oopses on umount since 2.5.12 because of changes to other parts of
+          the kernel. We probably want a kernel generic init_address_space()
+          function...
+        - Drop BKL from ntfs_readdir() after consultation with Al Viro. The
+          only caller of ->readdir() is vfs_readdir() which holds i_sem during
+          the call, and i_sem is sufficient protection against changes in the
+          directory inode (including ->i_size).
+        - Use generic_file_llseek() for directories (as opposed to
+          default_llseek()) as this downs i_sem instead of the BKL which is
+          what we now need for exclusion against ->f_pos changes considering we
+          no longer take the BKL in ntfs_readdir().
+2.0.5 - Major bugfix. Buffer overflow in extent inode handling.
+        - No need to set old blocksize in super.c::ntfs_fill_super() as the
+          VFS does so via invocation of deactivate_super() calling
+          fs->fill_super() calling block_kill_super() which does it.
+        - BKL moved from VFS into dir.c::ntfs_readdir(). (Linus Torvalds)
+          -> Do we really need it? I don't think so as we have exclusion on
+          the directory ntfs_inode rw_semaphore mrec_lock. We might have to
+          move the ->f_pos accesses under the mrec_lock though. Check this...
+        - Fix really, really, really stupid buffer overflow in extent inode
+          handling in mft.c::map_extent_mft_record().
+2.0.4 - Cleanups and updates for kernel 2.5.11.
+        - Add documentation on how to use the MD driver to be able to use NTFS
+          stripe and volume sets in Linux and generally cleanup documentation
+          a bit.
+        Remove all uses of kdev_t in favour of struct block_device *:
+        - Change compress.c::ntfs_file_read_compressed_block() to use
+          sb_getblk() instead of getblk().
+        - Change super.c::ntfs_fill_super() to use bdev_hardsect_size() instead
+          of get_hardsect_size().
+        - No need to get old blocksize in super.c::ntfs_fill_super() as
+          fs/super.c::get_sb_bdev() already does this.
+        - Set bh->b_bdev instead of bh->b_dev throughout aops.c.
+2.0.3 - Small bug fixes, cleanups, and performance improvements.
+        - Remove some dead code from mft.c.
+        - Optimize readpage and read_block functions throughout aops.c so that
+          only initialized blocks are read. Non-initialized ones have their
+          buffer head mapped, zeroed, and set up to date, without scheduling
+          any i/o. Thanks to Al Viro for advice on how to avoid the device i/o.
+        Thanks go to Andrew Morton for spotting the below:
+        - Fix buglet in allocate_compression_buffers() error code path.
+        - Call flush_dcache_page() after modifying page cache page contents in
+          ntfs_file_readpage().
+        - Check for existence of page buffers throughout aops.c before calling
+          create_empty_buffers(). This happens when an I/O error occurs and the
+          read is retried. (It also happens once writing is implemented so that
+          needed doing anyway but I had left it for later...)
+        - Don't BUG_ON() uptodate and/or mapped buffers throughout aops.c in
+          readpage and read_block functions. Reasoning same as above (i.e. I/O
+          error retries and future write code paths.)
+2.0.2 - Minor updates and cleanups.
+        - Cleanup: rename mst.c::__post_read_mst_fixup to post_write_mst_fixup
+          and cleanup the code a bit, removing the unused size parameter.
+        - Change default fmask to 0177 and update documentation.
+        - Change attrib.c::get_attr_search_ctx() to return the search context
+          directly instead of taking the address of a pointer. A return value
+          of NULL means the allocation failed. Updated all callers
+          appropriately.
+        - Update to 2.5.9 kernel (preserving backwards compatibility) by
+          replacing all occurrences of page->buffers with page_buffers(page).
+        - Fix minor bugs in runlist merging, also minor cleanup.
+        - Updates to bootsector layout and mft mirror contents descriptions.
+        - Small bug fix in error detection in unistr.c and some cleanups.
+        - Grow name buffer allocations in unistr.c in aligned multiples of 64
+          bytes.
+2.0.1 - Minor updates.
+        - Make default umask correspond to documentation.
+        - Improve documentation.
+        - Set default mode to include execute bit. The {u,f,d}mask can be used
+          to take it away if desired. This allows binaries to be executed from
+          a mounted ntfs partition.
+2.0.0 - New version number. Remove TNG from the name. Now in the kernel.
+        - Add kill_super, just keeping up with the vfs changes in the kernel.
+        - Repeat some changes from tng-0.0.8 that somehow got lost on the way
+          from the CVS import into BitKeeper.
+        - Begin to implement proper handling of allocated_size vs
+          initialized_size vs data_size (i.e. i_size). Done are
+          mft.c::ntfs_mft_readpage(), aops.c::end_buffer_read_index_async(),
+          and attrib.c::load_attribute_list().
+        - Lock the runlist in attrib.c::load_attribute_list() while using it.
+        - Fix memory leak in ntfs_file_read_compressed_block() and generally
+          clean up compress.c a little, removing some uncommented/unused debug
+          code.
+        - Tidy up dir.c a little bit.
+        - Don't bother getting the runlist in inode.c::ntfs_read_inode().
+        - Merge mft.c::ntfs_mft_readpage() and aops.c::ntfs_index_readpage()
+          creating aops.c::ntfs_mst_readpage(), improving the handling of
+          holes and overflow in the process and implementing the correct
+          equivalent of ntfs_file_get_block() in ntfs_mst_readpage() itself.
+          I am aiming for correctness at the moment. Modularisation can come
+          later.
+        - Rename aops.c::end_buffer_read_index_async() to
+          end_buffer_read_mst_async() and optimize the overflow checking and
+          handling.
+        - Use the host of the mftbmp address space mapping to hold the ntfs
+          volume. This is needed so the async i/o completion handler can
+          retrieve a pointer to the volume. Hopefully this will not cause
+          problems elsewhere in the kernel... Otherwise will need to use a
+          fake inode.
+        - Complete implementation of proper handling of allocated_size vs
+          initialized_size vs data_size (i.e. i_size) in whole driver.
+          Basically aops.c is now completely rewritten.
+        - Change NTFS driver name to just NTFS and set version number to 2.0.0
+          to make a clear distinction from the old driver which is still on
+          version 1.1.22.
+tng-0.0.8 - 08/03/2002 - Now using BitKeeper, http://linux-ntfs.bkbits.net/
+        - Replace bdevname(sb->s_dev) with sb->s_id.
+        - Remove now superfluous new-line characters in all callers of
+          ntfs_debug().
+        - Apply kludge in ntfs_read_inode(), setting i_nlink to 1 for
+          directories. Without this the "find" utility gets very upset which is
+          fair enough as Linux/Unix do not support directory hard links.
+        - Further runlist merging work. (Richard Russon)
+        - Backwards compatibility for gcc-2.95. (Richard Russon)
+        - Update to kernel 2.5.5-pre1 and rediff the now tiny patch.
+        - Convert to new file system declaration using ->ntfs_get_sb() and
+          replacing ntfs_read_super() with ntfs_fill_super().
+        - Set s_maxbytes to MAX_LFS_FILESIZE to avoid page cache page index
+          overflow on 32-bit architectures.
+        - Cleanup upcase loading code to use ntfs_(un)map_page().
+        - Disable/reenable preemption in critical sections of compression engine.
+        - Replace device size determination in ntfs_fill_super() with
+          sb->s_bdev->bd_inode->i_size (in bytes) and remove now superfluous
+          function super.c::get_nr_blocks().
+        - Implement a mount time option (show_inodes) allowing choice of which
+          types of inode names readdir() returns and modify ntfs_filldir()
+          accordingly. There are several parameters to show_inodes:
+                system: system files
+                win32:  long file names (including POSIX file names) [DEFAULT]
+                long:   same as win32
+                dos:    short file names only (excluding POSIX file names)
+                short:  same as dos
+                posix:  same as both win32 and dos
+                all:    all file names
+          Note that the options are additive, i.e. specifying:
+                -o show_inodes=system,show_inodes=win32,show_inodes=dos
+          is the same as specifying:
+                -o show_inodes=all
+          Note that the "posix" and "all" options will show all directory
+          names, BUT the link count on each directory inode entry is set to 1,
+          due to Linux not supporting directory hard links. This may well
+          confuse some userspace applications, since the directory names will
+          have the same inode numbers. Thus it is NOT advisable to use the
+          "posix" or "all" options. We provide them only for completeness' sake.
+        - Add copies of allocated_size, initialized_size, and compressed_size to
+          the ntfs inode structure and set them up in
+          inode.c::ntfs_read_inode(). These reflect the unnamed data attribute
+          for files and the index allocation attribute for directories.
+        - Add copies of allocated_size and initialized_size to ntfs inode for
+          $BITMAP attribute of large directories and set them up in
+          inode.c::ntfs_read_inode().
+        - Add copies of allocated_size and initialized_size to ntfs volume for
+          $BITMAP attribute of $MFT and set them up in
+          super.c::load_system_files().
+        - Parse deprecated ntfs driver options (iocharset, show_sys_files,
+          posix, and utf8) and tell user what the new options to use are. Note
+          we still do support them but they will be removed with kernel 2.7.x.
+        - Change all occurrences of integer long long printf formatting to hex
+          as printk() will not support long long integer format if/when the
+          div64 patch goes into the kernel.
+        - Make slab caches have stable names and change the names to what they
+          were intended to be. These changes are required/made possible by the
+          new slab cache name handling which removes the length limitation by
+          requiring the caller of kmem_cache_create() to supply a stable name
+          which is then referenced but not copied.
+        - Rename run_list structure to run_list_element and create a new
+          run_list structure containing a pointer to a run_list_element
+          structure and a read/write semaphore. Adapt all users of runlists
+          to new scheme and take and release the lock as needed. This fixes a
+          nasty race as the run_list changes even when inodes are locked for
+          reading and even when the inode isn't locked at all, so we really
+          needed the serialization. We use a semaphore rather than a spinlock
+          as memory allocations can sleep and doing everything GFP_ATOMIC
+          would be silly.
+        - Cleanup read_inode() removing all code checking for lowest_vcn != 0.
+          This can never happen due to the nature of lookup_attr() and how we
+          support attribute lists. If it did happen it would imply the inode
+          being corrupt.
+        - Check for lowest_vcn != 0 in ntfs_read_inode() and mark the inode as
+          bad if found.
+        - Update to 2.5.6-pre2 changes in struct address_space.
+        - Use parent_ino() when accessing d_parent inode number in dir.c.
+        - Import Sourceforge CVS repository into BitKeeper repository:
+                http://linux-ntfs.bkbits.net/ntfs-tng-2.5
+        - Update fs/Makefile, fs/Config.help, fs/Config.in, and
+          Documentation/filesystems/ntfs.txt for NTFS TNG.
+        - Create kernel configuration option controlling whether debugging
+          is enabled or not.
+        - Add the required export of end_buffer_io_sync() from the patches
+          directory to the kernel code.
+        - Update inode.c::ntfs_show_options() with show_inodes mount option.
+        - Update errors mount option.
+tng-0.0.7 - 13/02/2002 - The driver is now feature complete for read-only!
+        - Cleanup mft.c and its debug/error output in particular. Fix a minor
+          bug in mapping of extent inodes. Update all the comments to fit all
+          the recent code changes.
+        - Modify vcn_to_lcn() to cope with entirely unmapped runlists.
+        - Cleanups in compress.c, mostly comments and folding help.
+        - Implement attrib.c::map_run_list() as a generic helper.
+        - Make compress.c::ntfs_file_read_compressed_block() use map_run_list()
+          thus making code shorter and enabling attribute list support.
+        - Cleanup incorrect use of [su]64 with %L printf format specifier in
+          all source files. Type casts to [unsigned] long long added to correct
+          the mismatches (important for architectures which have long long not
+          being 64 bits).
+        - Merge async io completion handlers for directory indexes and $MFT
+          data into one by setting the index_block_size{_bits} of the ntfs
+          inode for $MFT to the mft_record_size{_bits} of the ntfs_volume.
+        - Cleanup aops.c, update comments.
+        - Make ntfs_file_get_block() use map_run_list() so all files now
+          support attribute lists.
+        - Make ntfs_dir_readpage() almost verbatim copy of
+          block_read_full_page() by using ntfs_file_get_block() with only real
+          difference being the use of our own async io completion handler
+          rather than the default one, thus reducing the amount of code and
+          automatically enabling attribute list support for directory indices.
+        - Fix bug in load_attribute_list() - forgot to call brelse in error
+          code path.
+        - Change parameters to find_attr() and lookup_attr(). We no longer
+          pass in the upcase table and its length. These can be gotten from
+          ctx->ntfs_ino->vol->upcase{_len}. Update all callers.
+        - Cleanups in attrib.c.
+        - Implement merging of runlists, attrib.c::merge_run_lists() and its
+          helpers. (Richard Russon)
+        - Attribute lists part 2, attribute extents and multi part runlists:
+          enable proper support for LCN_RL_NOT_MAPPED and automatic mapping of
+          further runlist parts via attrib.c::map_run_list().
+        - Tiny endianness bug fix in decompress_mapping_pairs().
+tng-0.0.6 - Encrypted directories, bug fixes, cleanups, debugging enhancements.
+        - Enable encrypted directories. (Their index root is marked encrypted
+          to indicate that new files in that directory should be created
+          encrypted.)
+        - Fix bug in NInoBmpNonResident() macro. (Cut and paste error.)
+        - Enable $Extend system directory. Most (if not all) extended system
+          files do not have unnamed data attributes so ntfs_read_inode() had to
+          special case them but that is ok, as the special casing recovery
+          happens inside an error code path so there is zero slow down in the
+          normal fast path. The special casing is done by introducing a new
+          function inode.c::ntfs_is_extended_system_file() which checks if any
+          of the hard links in the inode point to $Extend as being their parent
+          directory and if they do we assume this is an extended system file.
+        - Create a sysctl/proc interface to allow {dis,en}abling of debug output
+          when compiled with -DDEBUG. Default is debug messages to be disabled.
+          To enable them, one writes a non-zero value to /proc/sys/fs/ntfs-debug
+          (if /proc is enabled) or uses sysctl(2) to effect the same (if sysctl
+          interface is enabled). Inspired by old ntfs driver.
+        - Add debug_msgs insmod/kernel boot parameter to set whether debug
+          messages are {dis,en}abled. This is useful to enable debug messages
+          during ntfs initialization and is the only way to activate debugging
+          when the sysctl interface is not enabled.
+        - Cleanup debug output in various places.
+        - Remove all dollar signs ($) from the source (except comments) to
+          enable compilation on architectures whose gcc compiler does not
+          support dollar signs in the names of variables/constants. Attribute
+          types now start with AT_ instead of $ and $I30 is now just I30.
+        - Cleanup ntfs_lookup() and add consistency check of sequence numbers.
+        - Load complete runlist for $MFT/$BITMAP during mount and cleanup
+          access functions. This means we now cope with $MFT/$BITMAP being
+          spread across several mft records.
+        - Disable modification of mft_zone_multiplier on remount. We can always
+          reenable this later on if we really want to, but we will need to make
+          sure we readjust the mft_zone size / layout accordingly.
+tng-0.0.5 - Modernize for 2.5.x and further in line-ing with Al Viro's comments.
+        - Use sb_set_blocksize() instead of set_blocksize() and verify the
+          return value.
+        - Use sb_bread() instead of bread() throughout.
+        - Add index_vcn_size{_bits} to ntfs_inode structure to store the size
+          of a directory index block vcn. Apply resulting simplifications in
+          dir.c everywhere.
+        - Fix a small bug somewhere (but forgot what it was).
+        - Change ntfs_{debug,error,warning} to enable gcc to do type checking
+          on the printf-format parameter list and fix bugs reported by gcc
+          as a result. (Richard Russon)
+        - Move inode allocation strategy to Al's new stuff but maintain the
+          divorce of ntfs_inode from struct inode. To achieve this we have two
+          separate slab caches, one for big ntfs inodes containing a struct
+          inode and pure ntfs inodes and at the same time fix some faulty
+          error code paths in ntfs_read_inode().
+        - Show mount options in proc (inode.c::ntfs_show_options()).
+tng-0.0.4 - Big changes, getting in line with Al Viro's comments.
+        - Modified (un)map_mft_record functions to be common for read and write
+          case. To specify which is which, added extra parameter at front of
+          parameter list. Pass either READ or WRITE to this, each has the
+          obvious meaning.
+        - General cleanups to allow for easier folding in vi.
+        - attrib.c::decompress_mapping_pairs() now accepts the old runlist
+          argument, and invokes attrib.c::merge_run_lists() to merge the old
+          and the new runlists.
+        - Removed attrib.c::find_first_attr().
+        - Implemented loading of attribute list and complete runlist for $MFT.
+          This means we now cope with $MFT being spread across several mft
+          records.
+        - Adapt to 2.5.2-pre9 and the changed create_empty_buffers() syntax.
+        - Adapt major/minor/kdev_t/[bk]devname stuff to new 2.5.x kernels.
+        - Make ntfs_volume be allocated via kmalloc() instead of using a slab
+          cache. There are too few ntfs_volume structures at any one time
+          to justify a private slab cache.
+        - Fix bogus kmap() use in async io completion. Now use kmap_atomic().
+          Use KM_BIO_IRQ on advice from IRC/kernel...
+        - Use ntfs_map_page() in map_mft_record() and create ->readpage method
+          for reading $MFT (ntfs_mft_readpage). In the process create dedicated
+          address space operations (ntfs_mft_aops) for $MFT inode mapping. Also
+          removed the now superfluous exports from the kernel core patch.
+        - Fix a bug where kfree() was used instead of ntfs_free().
+        - Change map_mft_record() to take ntfs_inode as argument instead of
+          vfs inode. Ditto for unmap_mft_record(). Adapt all callers.
+        - Add pointer to ntfs_volume to ntfs_inode.
+        - Add mft record number and sequence number to ntfs_inode. Stop using
+          i_ino and i_generation for in-driver purposes.
+        - Implement attrib.c::merge_run_lists(). (Richard Russon)
+        - Remove use of proper inodes by extent inodes. Move i_ino and
+          i_generation to ntfs_inode to do this. Apply simplifications that
+          result and remove iget_no_wait(), etc.
+        - Pass ntfs_inode everywhere in the driver (used to be struct inode).
+        - Add reference counting in ntfs_inode for the ntfs inode itself and
+          for the mapped mft record.
+        - Extend mft record mapping so we can (un)map extent mft records (new
+          functions (un)map_extent_mft_record), and so mappings are reference
+          counted and don't have to happen twice if already mapped - just ref
+          count increases.
+        - Add -o iocharset as alias to -o nls for backwards compatibility.
+        - The latest core patch is now tiny. In fact just a single additional
+          export is necessary over the base kernel.
+tng-0.0.3 - Cleanups, enhancements, bug fixes.
+        - Work on attrib.c::decompress_mapping_pairs() to detect base extents
+          and setup the runlist appropriately using knowledge provided by the
+          sizes in the base attribute record.
+        - Balance the get_/put_attr_search_ctx() calls so we don't leak memory
+          any more.
+        - Introduce ntfs_malloc_nofs() and ntfs_free() to allocate/free a single
+          page or use vmalloc depending on the amount of memory requested.
+        - Cleanup error output. The __FUNCTION__ "(): " is now added
+          automatically. Introduced a new header file debug.h to support this
+          and also moved ntfs_debug() function into it.
+        - Make reading of compressed files more intelligent and especially get
+          rid of the vmalloc_nofs() from readpage(). This now uses per CPU
+          buffers (allocated at first mount with cluster size <= 4kiB and
+          deallocated on last umount with cluster size <= 4kiB), and
+          asynchronous io for the compressed data using a list of buffer heads.
+          Er, we use synchronous io as async io only works on whole pages
+          covered by buffers and not on individual buffer heads...
+        - Bug fix for reading compressed files with sparse compression blocks.
+tng-0.0.2 - Now handles larger/fragmented/compressed volumes/files/dirs.
+        - Fixed handling of directories when cluster size exceeds index block
+          size.
+        - Hide DOS only name space directory entries from readdir() but allow
+          them in lookup(). This should fix the problem that Linux doesn't
+          support directory hard links, while still allowing access to entries
+          via their short file name. This also has the benefit of mimicking
+          what Windows users are used to, so it is the ideal solution.
+        - Implemented sync_page everywhere so no more hangs in D state when
+          waiting for a page.
+        - Stop using bforget() in favour of brelse().
+        - Stop locking buffers unnecessarily.
+        - Implemented compressed files (inode->mapping contains uncompressed
+          data, raw compressed data is currently bread() into a vmalloc()ed
+          memory buffer).
+        - Enable compressed directories. (Their index root is marked compressed
+          to indicate that new files in that directory should be created
+          compressed.)
+        - Use vsnprintf rather than vsprintf in the ntfs_error and ntfs_warning
+          functions. (Thanks to Will Dyson for pointing this out.)
+        - Moved the ntfs_inode and ntfs_volume (the former ntfs_inode_info and
+          ntfs_sb_info) out of the common inode and super_block structures and
+          started using the generic_ip and generic_sbp pointers instead. This
+          makes ntfs entirely private with respect to the kernel tree.
+        - Detect compiler version and abort with error message if gcc less than
+          2.96 is used.
+        - Fix bug in name comparison function in unistr.c.
+        - Implement attribute lists part 1, the infrastructure (search contexts
+          and operations, find_external_attr(), lookup_attr()) and make the
+          code use the infrastructure.
+        - Fix stupid buffer overflow bug that became apparent on larger run
+          list containing attributes.
+        - Fix bugs in readdir() that became apparent on larger directories.
+        The driver is now really useful and survives the test
+                find . -type f -exec md5sum "{}" \;
+        without any error messages on an over 1GiB sized partition with >16k
+        files on it, including compressed files and directories and many files
+        and directories with attribute lists.
+tng-0.0.1 - The first useful version.
+        - Added ntfs_lookup().
+        - Added default upcase generation and handling.
+        - Added compile options to be shown on module init.
+        - Many bug fixes that were "hidden" before.
+        - Update to latest kernel.
+        - Added ntfs_readdir().
+        - Added file operations for mmap(), read(), open() and llseek(). We just
+          use the generic ones. The whole point of going through implementing
+          readpage() methods and where possible get_block() call backs is that
+          this allows us to make use of the generic high level methods provided
+          by the kernel.
+        The driver is now actually useful! Yey. (-: It undoubtedly has got bugs
+        though and it doesn't implement accessing compressed files yet. Also,
+        accessing files with attribute list attributes is not implemented yet
+        either. But for small or simple file systems it should work and allow
+        you to list directories, use stat on directory entries and the file
+        system, open, read, mmap and llseek around in files. A big milestone
+        has been reached!
+tng-0.0.0 - Initial version tag.
+        Initial driver implementation. The driver can mount and umount simple
+        NTFS file systems (i.e. ones without attribute lists in the system
+        files). If the mount fails there might be problems in the error handling
+        code paths, so be warned. Otherwise it seems to be loading the system
+        files nicely and the mft record read mapping/unmapping seems to be
+        working nicely, too. Proof of inode metadata in the page cache and non-
+        resident file unnamed stream data in the page cache concepts is thus
+        complete.
diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile
new file mode 100644
index 000000000000..7b66381a0b0f
--- /dev/null
+++ b/fs/ntfs/Makefile
@@ -0,0 +1,19 @@
+# Rules for making the NTFS driver.
+# The driver object ntfs.o is added to the obj-<CONFIG_NTFS_FS value> list.
+obj-$(CONFIG_NTFS_FS) += ntfs.o
+# Objects that are always linked into ntfs.o.
+ntfs-objs := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \
+             index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \
+             unistr.o upcase.o
+# Driver version string, handed to the sources as the NTFS_VERSION macro.
+EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.22\"
+# Define DEBUG for the sources when CONFIG_NTFS_DEBUG is set to y.
+ifeq ($(CONFIG_NTFS_DEBUG),y)
+EXTRA_CFLAGS += -DDEBUG
+endif
+# When CONFIG_NTFS_RW is set to y, define NTFS_RW and link the extra objects.
+ifeq ($(CONFIG_NTFS_RW),y)
+EXTRA_CFLAGS += -DNTFS_RW
+ntfs-objs += bitmap.o lcnalloc.o logfile.o quota.o
+endif
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
new file mode 100644
index 000000000000..45d56e41ed98
--- /dev/null
+++ b/fs/ntfs/aops.c
@@ -0,0 +1,2324 @@
+/**
+ * aops.c - NTFS kernel address space operations and page cache handling.
+ *          Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2004 Anton Altaparmakov
+ * Copyright (c) 2002 Richard Russon
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/buffer_head.h>
+#include <linux/writeback.h>
+#include "aops.h"
+#include "attrib.h"
+#include "debug.h"
+#include "inode.h"
+#include "mft.h"
+#include "runlist.h"
+#include "types.h"
+#include "ntfs.h"
+/**
+ * ntfs_end_buffer_async_read - async io completion for reading attributes
+ * @bh:         buffer head on which io is completed
+ * @uptodate:   whether @bh is now uptodate or not
+ *
+ * Asynchronous I/O completion handler for reading pages belonging to the
+ * attribute address space of an inode.  The inodes can either be files or
+ * directories or they can be fake inodes describing some attribute.
+ *
+ * On successful io, @bh is marked uptodate and any part of it that lies beyond
+ * the initialized size of the attribute is zeroed.  On failed io, @bh is
+ * marked not uptodate, the error is logged, and the error bit is set on the
+ * page.
+ *
+ * The async read flag is then cleared on @bh and @bh is unlocked.  If another
+ * buffer of the page still has its async read flag set, we return and leave
+ * the page locked; the page is finished by a later completion.  Otherwise we
+ * finish the page: it is marked uptodate only if every buffer is uptodate and
+ * the page has no error bit set, and it is then unlocked.
+ *
+ * If NInoMstProtected(), the post read mst fixups are applied to every ntfs
+ * record in the page before the page is finished.  To determine the size of
+ * the records that need fixing up, we cheat a little bit by setting the
+ * index_block_size in ntfs_inode to the ntfs record size, and
+ * index_block_size_bits, to the log(base 2) of the ntfs record size.
+ *
+ * Both atomic kmaps use the KM_BIO_SRC_IRQ slot.  Interrupts are disabled for
+ * as long as such a mapping is live so that an interrupt handler using the
+ * same slot cannot replace the mapping underneath us.
+ */
+static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
+{
+        /* Shared by all invocations; protects the scan of a page's buffers. */
+        static DEFINE_SPINLOCK(page_uptodate_lock);
+        unsigned long flags;
+        struct buffer_head *tmp;
+        struct page *page;
+        ntfs_inode *ni;
+        int page_uptodate = 1;
+        page = bh->b_page;
+        ni = NTFS_I(page->mapping->host);
+        if (likely(uptodate)) {
+                s64 file_ofs;
+                set_buffer_uptodate(bh);
+                /* Byte offset of the start of @bh within the attribute. */
+                file_ofs = ((s64)page->index << PAGE_CACHE_SHIFT) +
+                                bh_offset(bh);
+                /* Check for the current buffer head overflowing. */
+                if (file_ofs + bh->b_size > ni->initialized_size) {
+                        char *addr;
+                        int ofs = 0;
+                        /* Preserve the leading, still initialized, bytes. */
+                        if (file_ofs < ni->initialized_size)
+                                ofs = ni->initialized_size - file_ofs;
+                        /*
+                         * Keep interrupts off while the KM_BIO_SRC_IRQ slot
+                         * is mapped.  @flags is free for use here as the spin
+                         * lock below has not been taken yet.
+                         */
+                        local_irq_save(flags);
+                        addr = kmap_atomic(page, KM_BIO_SRC_IRQ);
+                        memset(addr + bh_offset(bh) + ofs, 0, bh->b_size - ofs);
+                        flush_dcache_page(page);
+                        kunmap_atomic(addr, KM_BIO_SRC_IRQ);
+                        local_irq_restore(flags);
+                }
+        } else {
+                clear_buffer_uptodate(bh);
+                ntfs_error(ni->vol->sb, "Buffer I/O error, logical block %llu.",
+                                (unsigned long long)bh->b_blocknr);
+                SetPageError(page);
+        }
+        spin_lock_irqsave(&page_uptodate_lock, flags);
+        clear_buffer_async_read(bh);
+        unlock_buffer(bh);
+        /* Walk the circular list of buffers attached to the page. */
+        tmp = bh;
+        do {
+                if (!buffer_uptodate(tmp))
+                        page_uptodate = 0;
+                if (buffer_async_read(tmp)) {
+                        /* Another buffer is still being read, not done yet. */
+                        if (likely(buffer_locked(tmp)))
+                                goto still_busy;
+                        /* Async buffers must be locked. */
+                        BUG();
+                }
+                tmp = tmp->b_this_page;
+        } while (tmp != bh);
+        spin_unlock_irqrestore(&page_uptodate_lock, flags);
+        /*
+         * If none of the buffers had errors then we can set the page uptodate,
+         * but we first have to perform the post read mst fixups, if the
+         * attribute is mst protected, i.e. if NInoMstProtected(ni) is true.
+         * Note we ignore fixup errors as those are detected when
+         * map_mft_record() is called which gives us per record granularity
+         * rather than per page granularity.
+         */
+        if (!NInoMstProtected(ni)) {
+                if (likely(page_uptodate && !PageError(page)))
+                        SetPageUptodate(page);
+        } else {
+                char *addr;
+                unsigned int i, recs;
+                u32 rec_size;
+                rec_size = ni->itype.index.block_size;
+                recs = PAGE_CACHE_SIZE / rec_size;
+                /* Should have been verified before we got here... */
+                BUG_ON(!recs);
+                /*
+                 * Keep interrupts off while the KM_BIO_SRC_IRQ slot is
+                 * mapped.  The spin lock has been dropped so @flags can be
+                 * reused.
+                 */
+                local_irq_save(flags);
+                addr = kmap_atomic(page, KM_BIO_SRC_IRQ);
+                for (i = 0; i < recs; i++)
+                        post_read_mst_fixup((NTFS_RECORD*)(addr +
+                                        i * rec_size), rec_size);
+                flush_dcache_page(page);
+                kunmap_atomic(addr, KM_BIO_SRC_IRQ);
+                local_irq_restore(flags);
+                if (likely(!PageError(page) && page_uptodate))
+                        SetPageUptodate(page);
+        }
+        unlock_page(page);
+        return;
+still_busy:
+        /* The page stays locked; a later completion will finish it. */
+        spin_unlock_irqrestore(&page_uptodate_lock, flags);
+        return;
+}
+/**
+ * ntfs_read_block - fill a @page of an address space with data
+ * @page:       page cache page to fill with data
+ *
+ * Fill the page @page of the address space belonging to the
+ * @page->mapping->host inode.  We read each buffer asynchronously and when all
+ * buffers are read in, our io completion handler ntfs_end_buffer_async_read(),
+ * if required, automatically applies the mst fixups to the page before finally
+ * marking it uptodate and unlocking it.
+ *
+ * We only enforce allocated_size limit because i_size is checked for in
+ * generic_file_read().
+ *
+ * Return 0 on success and -errno on error.
+ *
+ * Contains an adapted version of fs/buffer.c::block_read_full_page().
+ */
+static int ntfs_read_block(struct page *page)
+{
+        VCN vcn;
+        LCN lcn;
+        ntfs_inode *ni;
+        ntfs_volume *vol;
+        runlist_element *rl;
+        struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
+        sector_t iblock, lblock, zblock;
+        unsigned int blocksize, vcn_ofs;
+        int i, nr;
+        unsigned char blocksize_bits;
+        ni = NTFS_I(page->mapping->host);
+        vol = ni->vol;
+        /* $MFT/$DATA must have its complete runlist in memory at all times. */
+        BUG_ON(!ni->runlist.rl && !ni->mft_no && !NInoAttr(ni));
+        blocksize_bits = VFS_I(ni)->i_blkbits;
+        blocksize = 1 << blocksize_bits;
+        if (!page_has_buffers(page))
+                create_empty_buffers(page, blocksize, 0);
+        bh = head = page_buffers(page);
+        if (unlikely(!bh)) {
+                unlock_page(page);
+                return -ENOMEM;
+        }
+        iblock = (s64)page->index << (PAGE_CACHE_SHIFT - blocksize_bits);
+        lblock = (ni->allocated_size + blocksize - 1) >> blocksize_bits;
+        zblock = (ni->initialized_size + blocksize - 1) >> blocksize_bits;
+        /* Loop through all the buffers in the page. */
+        rl = NULL;
+        nr = i = 0;
+        do {
+                u8 *kaddr;
+                if (unlikely(buffer_uptodate(bh)))
+                        continue;
+                if (unlikely(buffer_mapped(bh))) {
+                        arr[nr++] = bh;
+                        continue;
+                }
+                bh->b_bdev = vol->sb->s_bdev;
+                /* Is the block within the allowed limits? */
+                if (iblock < lblock) {
+                        BOOL is_retry = FALSE;
+                        /* Convert iblock into corresponding vcn and offset. */
+                        vcn = (VCN)iblock << blocksize_bits >>
+                                        vol->cluster_size_bits;
+                        vcn_ofs = ((VCN)iblock << blocksize_bits) &
+                                        vol->cluster_size_mask;
+                        if (!rl) {
+lock_retry_remap:
+                                down_read(&ni->runlist.lock);
+                                rl = ni->runlist.rl;
+                        }
+                        if (likely(rl != NULL)) {
+                                /* Seek to element containing target vcn. */
+                                while (rl->length && rl[1].vcn <= vcn)
+                                        rl++;
+                                lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
+                        } else
+                                lcn = LCN_RL_NOT_MAPPED;
+                        /* Successful remap. */
+                        if (lcn >= 0) {
+                                /* Setup buffer head to correct block. */
+                                bh->b_blocknr = ((lcn << vol->cluster_size_bits)
+                                                + vcn_ofs) >> blocksize_bits;
+                                set_buffer_mapped(bh);
+                                /* Only read initialized data blocks. */
+                                if (iblock < zblock) {
+                                        arr[nr++] = bh;
+                                        continue;
+                                }
+                                /* Fully non-initialized data block, zero it. */
+                                goto handle_zblock;
+                        }
+                        /* It is a hole, need to zero it. */
+                        if (lcn == LCN_HOLE)
+                                goto handle_hole;
+                        /* If first try and runlist unmapped, map and retry. */
+                        if (!is_retry && lcn == LCN_RL_NOT_MAPPED) {
+                                int err;
+                                is_retry = TRUE;
+                                /*
+                                 * Attempt to map runlist, dropping lock for
+                                 * the duration.
+                                 */
+                                up_read(&ni->runlist.lock);
+                                err = ntfs_map_runlist(ni, vcn);
+                                if (likely(!err))
+                                        goto lock_retry_remap;
+                                rl = NULL;
+                                lcn = err;
+                        }
+                        /* Hard error, zero out region. */
+                        bh->b_blocknr = -1;
+                        SetPageError(page);
+                        ntfs_error(vol->sb, "Failed to read from inode 0x%lx, "
+                                        "attribute type 0x%x, vcn 0x%llx, "
+                                        "offset 0x%x because its location on "
+                                        "disk could not be determined%s "
+                                        "(error code %lli).", ni->mft_no,
+                                        ni->type, (unsigned long long)vcn,
+                                        vcn_ofs, is_retry ? " even after "
+                                        "retrying" : "", (long long)lcn);
+                }
+                /*
+                 * Either iblock was outside lblock limits or
+                 * ntfs_rl_vcn_to_lcn() returned error.  Just zero that portion
+                 * of the page and set the buffer uptodate.
+                 */
+handle_hole:
+                bh->b_blocknr = -1UL;
+                clear_buffer_mapped(bh);
+handle_zblock:
+                kaddr = kmap_atomic(page, KM_USER0);
+                memset(kaddr + i * blocksize, 0, blocksize);
+                flush_dcache_page(page);
+                kunmap_atomic(kaddr, KM_USER0);
+                set_buffer_uptodate(bh);
+        } while (i++, iblock++, (bh = bh->b_this_page) != head);
+        /* Release the lock if we took it. */
+        if (rl)
+                up_read(&ni->runlist.lock);
+        /* Check we have at least one buffer ready for i/o. */
+        if (nr) {
+                struct buffer_head *tbh;
+                /* Lock the buffers. */
+                for (i = 0; i < nr; i++) {
+                        tbh = arr[i];
+                        lock_buffer(tbh);
+                        tbh->b_end_io = ntfs_end_buffer_async_read;
+                        set_buffer_async_read(tbh);
+                }
+                /* Finally, start i/o on the buffers. */
+                for (i = 0; i < nr; i++) {
+                        tbh = arr[i];
+                        if (likely(!buffer_uptodate(tbh)))
+                                submit_bh(READ, tbh);
+                        else
+                                ntfs_end_buffer_async_read(tbh, 1);
+                }
+                return 0;
+        }
+        /* No i/o was scheduled on any of the buffers. */
+        if (likely(!PageError(page)))
+                SetPageUptodate(page);
+        else /* Signal synchronous i/o error. */
+                nr = -EIO;
+        unlock_page(page);
+        return nr;
+}
+/**
+ * ntfs_readpage - fill a @page of a @file with data from the device
+ * @file:       open file to which the page @page belongs or NULL (not used by
+ *              this function itself)
+ * @page:       page cache page to fill with data (must be locked on entry)
+ *
+ * For non-resident attributes, ntfs_readpage() fills the @page of the open
+ * file @file by calling the ntfs version of the generic block_read_full_page()
+ * function, ntfs_read_block(), which in turn creates and reads in the buffers
+ * associated with the page asynchronously.  Unnamed, compressed $DATA
+ * attributes are handed to ntfs_read_compressed_block() instead and access to
+ * unnamed, encrypted $DATA attributes is denied with -EACCES.
+ *
+ * For resident attributes, OTOH, ntfs_readpage() fills @page by copying the
+ * data from the mft record (which at this stage is most likely in memory) and
+ * fills the remainder with zeroes. Thus, in this case, I/O is synchronous, as
+ * even if the mft record is not cached at this point in time, we need to wait
+ * for it to be read in before we can do the copy.
+ *
+ * On all paths handled directly in this function (i.e. everything except the
+ * two non-resident read helpers, which take over responsibility for the page)
+ * the @page is unlocked before returning.  The @page is only marked uptodate
+ * if it was actually filled; on error its uptodate state is left untouched so
+ * that a later read attempt does not pick up uninitialized page contents.
+ *
+ * Return 0 on success and -errno on error.
+ */
+static int ntfs_readpage(struct file *file, struct page *page)
+{
+        loff_t i_size;
+        ntfs_inode *ni, *base_ni;
+        u8 *kaddr;
+        ntfs_attr_search_ctx *ctx;
+        MFT_RECORD *mrec;
+        u32 attr_len;
+        int err = 0;
+        BUG_ON(!PageLocked(page));
+        /*
+         * This can potentially happen because we clear PageUptodate() during
+         * ntfs_writepage() of MstProtected() attributes.
+         */
+        if (PageUptodate(page)) {
+                unlock_page(page);
+                return 0;
+        }
+        ni = NTFS_I(page->mapping->host);
+        /* NInoNonResident() == NInoIndexAllocPresent() */
+        if (NInoNonResident(ni)) {
+                /*
+                 * Only unnamed $DATA attributes can be compressed or
+                 * encrypted.
+                 */
+                if (ni->type == AT_DATA && !ni->name_len) {
+                        /* If file is encrypted, deny access, just like NT4. */
+                        if (NInoEncrypted(ni)) {
+                                err = -EACCES;
+                                goto err_out;
+                        }
+                        /* Compressed data streams are handled in compress.c. */
+                        if (NInoCompressed(ni))
+                                return ntfs_read_compressed_block(page);
+                }
+                /* Normal data stream. */
+                return ntfs_read_block(page);
+        }
+        /*
+         * Attribute is resident, implying it is not compressed or encrypted.
+         * This also means the attribute is smaller than an mft record and
+         * hence smaller than a page, so can simply zero out any pages with
+         * index above 0.  We can also do this if the file size is 0.
+         */
+        if (unlikely(page->index > 0 || !i_size_read(VFS_I(ni)))) {
+                kaddr = kmap_atomic(page, KM_USER0);
+                memset(kaddr, 0, PAGE_CACHE_SIZE);
+                flush_dcache_page(page);
+                kunmap_atomic(kaddr, KM_USER0);
+                goto done;
+        }
+        /*
+         * The attribute record lives in the mft record of the base inode, so
+         * for attribute inodes go via the base ntfs inode.
+         */
+        if (!NInoAttr(ni))
+                base_ni = ni;
+        else
+                base_ni = ni->ext.base_ntfs_ino;
+        /* Map, pin, and lock the mft record. */
+        mrec = map_mft_record(base_ni);
+        if (IS_ERR(mrec)) {
+                err = PTR_ERR(mrec);
+                goto err_out;
+        }
+        ctx = ntfs_attr_get_search_ctx(base_ni, mrec);
+        if (unlikely(!ctx)) {
+                err = -ENOMEM;
+                goto unm_err_out;
+        }
+        err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+                        CASE_SENSITIVE, 0, NULL, 0, ctx);
+        if (unlikely(err))
+                goto put_unm_err_out;
+        /* Never copy more than the current i_size worth of attribute data. */
+        attr_len = le32_to_cpu(ctx->attr->data.resident.value_length);
+        i_size = i_size_read(VFS_I(ni));
+        if (unlikely(attr_len > i_size))
+                attr_len = i_size;
+        kaddr = kmap_atomic(page, KM_USER0);
+        /* Copy the data to the page. */
+        memcpy(kaddr, (u8*)ctx->attr +
+                        le16_to_cpu(ctx->attr->data.resident.value_offset),
+                        attr_len);
+        /* Zero the remainder of the page. */
+        memset(kaddr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
+        flush_dcache_page(page);
+        kunmap_atomic(kaddr, KM_USER0);
+put_unm_err_out:
+        ntfs_attr_put_search_ctx(ctx);
+unm_err_out:
+        unmap_mft_record(base_ni);
+done:
+        /*
+         * The two error labels above fall through to here in order to share
+         * the cleanup code, so only mark the page uptodate if it really was
+         * filled.  Otherwise a failed read would leave a page with undefined
+         * contents marked as valid in the page cache.
+         */
+        if (likely(!err))
+                SetPageUptodate(page);
+err_out:
+        unlock_page(page);
+        return err;
+}
+#ifdef NTFS_RW
+/**
+ * ntfs_write_block - write a @page to the backing store
+ * @page:       page cache page to write out
+ * @wbc:        writeback control structure
+ *
+ * This function is for writing pages belonging to non-resident, non-mst
+ * protected attributes to their backing store.
+ *
+ * For a page with buffers, map and write the dirty buffers asynchronously
+ * under page writeback. For a page without buffers, create buffers for the
+ * page, then proceed as above.
+ *
+ * If a page doesn't have buffers the page dirty state is definitive. If a page
+ * does have buffers, the page dirty state is just a hint, and the buffer dirty
+ * state is definitive. (A hint which has rules: dirty buffers against a clean
+ * page is illegal. Other combinations are legal and need to be handled. In
+ * particular a dirty page containing clean buffers for example.)
+ *
+ * The size of the inode is sampled exactly once with i_size_read() and that
+ * snapshot is used for all decisions made for this page, so that a concurrent
+ * change of i_size cannot cause different buffers of the same page to be
+ * judged against different sizes.
+ *
+ * Error handling: writing beyond the initialized size and writing into sparse
+ * regions are not supported yet; both are logged and, for now, reported as
+ * success (0).  On -ENOMEM the page is redirtied so it is retried later and 0
+ * is returned.  Any other error sets the page error flag and is returned.  In
+ * all cases the page is unlocked before returning.
+ *
+ * Return 0 on success and -errno on error.
+ *
+ * Based on ntfs_read_block() and __block_write_full_page().
+ */
+static int ntfs_write_block(struct page *page, struct writeback_control *wbc)
+{
+        VCN vcn;
+        LCN lcn;
+        loff_t i_size;
+        sector_t block, dblock, iblock;
+        struct inode *vi;
+        ntfs_inode *ni;
+        ntfs_volume *vol;
+        runlist_element *rl;
+        struct buffer_head *bh, *head;
+        unsigned int blocksize, vcn_ofs;
+        int err;
+        BOOL need_end_writeback;
+        unsigned char blocksize_bits;
+        vi = page->mapping->host;
+        ni = NTFS_I(vi);
+        vol = ni->vol;
+        ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
+                        "0x%lx.", ni->mft_no, ni->type, page->index);
+        BUG_ON(!NInoNonResident(ni));
+        BUG_ON(NInoMstProtected(ni));
+        blocksize_bits = vi->i_blkbits;
+        blocksize = 1 << blocksize_bits;
+        if (!page_has_buffers(page)) {
+                /*
+                 * No buffers means the page state is definitive, thus create
+                 * the buffers as uptodate and dirty.
+                 */
+                BUG_ON(!PageUptodate(page));
+                create_empty_buffers(page, blocksize,
+                                (1 << BH_Uptodate) | (1 << BH_Dirty));
+        }
+        bh = head = page_buffers(page);
+        if (unlikely(!bh)) {
+                ntfs_warning(vol->sb, "Error allocating page buffers. "
+                                "Redirtying page so we try again later.");
+                /*
+                 * Put the page back on mapping->dirty_pages, but leave its
+                 * buffer's dirty state as-is.
+                 */
+                redirty_page_for_writepage(wbc, page);
+                unlock_page(page);
+                return 0;
+        }
+        /* NOTE: Different naming scheme to ntfs_read_block()! */
+        /* The first block in the page. */
+        block = (s64)page->index << (PAGE_CACHE_SHIFT - blocksize_bits);
+        /*
+         * Sample i_size once.  A truncate may be changing it under us, so use
+         * i_size_read() rather than accessing vi->i_size directly and use the
+         * same value everywhere below.
+         */
+        i_size = i_size_read(vi);
+        /* The first out of bounds block for the data size. */
+        dblock = (i_size + blocksize - 1) >> blocksize_bits;
+        /* The last (fully or partially) initialized block. */
+        iblock = ni->initialized_size >> blocksize_bits;
+        /*
+         * Be very careful.  We have no exclusion from __set_page_dirty_buffers
+         * here, and the (potentially unmapped) buffers may become dirty at
+         * any time.  If a buffer becomes dirty here after we've inspected it
+         * then we just miss that fact, and the page stays dirty.
+         *
+         * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
+         * handle that here by just cleaning them.
+         */
+        /*
+         * Loop through all the buffers in the page, mapping all the dirty
+         * buffers to disk addresses and handling any aliases from the
+         * underlying block device's mapping.
+         */
+        rl = NULL;
+        err = 0;
+        do {
+                BOOL is_retry = FALSE;
+                if (unlikely(block >= dblock)) {
+                        /*
+                         * Mapped buffers outside i_size will occur, because
+                         * this page can be outside i_size when there is a
+                         * truncate in progress. The contents of such buffers
+                         * were zeroed by ntfs_writepage().
+                         *
+                         * FIXME: What about the small race window where
+                         * ntfs_writepage() has not done any clearing because
+                         * the page was within i_size but before we get here,
+                         * vmtruncate() modifies i_size?
+                         */
+                        clear_buffer_dirty(bh);
+                        set_buffer_uptodate(bh);
+                        continue;
+                }
+                /* Clean buffers are not written out, so no need to map them. */
+                if (!buffer_dirty(bh))
+                        continue;
+                /* Make sure we have enough initialized size. */
+                if (unlikely((block >= iblock) &&
+                                (ni->initialized_size < i_size))) {
+                        /*
+                         * If this page is fully outside initialized size, zero
+                         * out all pages between the current initialized size
+                         * and the current page. Just use ntfs_readpage() to do
+                         * the zeroing transparently.
+                         */
+                        if (block > iblock) {
+                                // TODO:
+                                // For each page do:
+                                // - read_cache_page()
+                                // Again for each page do:
+                                // - wait_on_page_locked()
+                                // - Check (PageUptodate(page) &&
+                                //                      !PageError(page))
+                                // Update initialized size in the attribute and
+                                // in the inode.
+                                // Again, for each page do:
+                                //      __set_page_dirty_buffers();
+                                // page_cache_release()
+                                // We don't need to wait on the writes.
+                                // Update iblock.
+                        }
+                        /*
+                         * The current page straddles initialized size. Zero
+                         * all non-uptodate buffers and set them uptodate (and
+                         * dirty?). Note, there aren't any non-uptodate buffers
+                         * if the page is uptodate.
+                         * FIXME: For an uptodate page, the buffers may need to
+                         * be written out because they were not initialized on
+                         * disk before.
+                         */
+                        if (!PageUptodate(page)) {
+                                // TODO:
+                                // Zero any non-uptodate buffers up to i_size.
+                                // Set them uptodate and dirty.
+                        }
+                        // TODO:
+                        // Update initialized size in the attribute and in the
+                        // inode (up to i_size).
+                        // Update iblock.
+                        // FIXME: This is inefficient. Try to batch the two
+                        // size changes to happen in one go.
+                        ntfs_error(vol->sb, "Writing beyond initialized size "
+                                        "is not supported yet. Sorry.");
+                        err = -EOPNOTSUPP;
+                        break;
+                        // Do NOT set_buffer_new() BUT DO clear buffer range
+                        // outside write request range.
+                        // set_buffer_uptodate() on complete buffers as well as
+                        // set_buffer_dirty().
+                }
+                /* No need to map buffers that are already mapped. */
+                if (buffer_mapped(bh))
+                        continue;
+                /* Unmapped, dirty buffer. Need to map it. */
+                bh->b_bdev = vol->sb->s_bdev;
+                /* Convert block into corresponding vcn and offset. */
+                vcn = (VCN)block << blocksize_bits;
+                vcn_ofs = vcn & vol->cluster_size_mask;
+                vcn >>= vol->cluster_size_bits;
+                /*
+                 * The runlist lock is taken lazily on the first buffer that
+                 * needs mapping and then kept for the rest of the page.
+                 */
+                if (!rl) {
+lock_retry_remap:
+                        down_read(&ni->runlist.lock);
+                        rl = ni->runlist.rl;
+                }
+                if (likely(rl != NULL)) {
+                        /* Seek to element containing target vcn. */
+                        while (rl->length && rl[1].vcn <= vcn)
+                                rl++;
+                        lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
+                } else
+                        lcn = LCN_RL_NOT_MAPPED;
+                /* Successful remap. */
+                if (lcn >= 0) {
+                        /* Setup buffer head to point to correct block. */
+                        bh->b_blocknr = ((lcn << vol->cluster_size_bits) +
+                                        vcn_ofs) >> blocksize_bits;
+                        set_buffer_mapped(bh);
+                        continue;
+                }
+                /* It is a hole, need to instantiate it. */
+                if (lcn == LCN_HOLE) {
+                        // TODO: Instantiate the hole.
+                        // clear_buffer_new(bh);
+                        // unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
+                        ntfs_error(vol->sb, "Writing into sparse regions is "
+                                        "not supported yet. Sorry.");
+                        err = -EOPNOTSUPP;
+                        break;
+                }
+                /* If first try and runlist unmapped, map and retry. */
+                if (!is_retry && lcn == LCN_RL_NOT_MAPPED) {
+                        is_retry = TRUE;
+                        /*
+                         * Attempt to map runlist, dropping lock for
+                         * the duration.
+                         */
+                        up_read(&ni->runlist.lock);
+                        err = ntfs_map_runlist(ni, vcn);
+                        if (likely(!err))
+                                goto lock_retry_remap;
+                        /* The lock is no longer held, remember that. */
+                        rl = NULL;
+                        lcn = err;
+                }
+                /* Failed to map the buffer, even after retrying. */
+                bh->b_blocknr = -1;
+                ntfs_error(vol->sb, "Failed to write to inode 0x%lx, "
+                                "attribute type 0x%x, vcn 0x%llx, offset 0x%x "
+                                "because its location on disk could not be "
+                                "determined%s (error code %lli).", ni->mft_no,
+                                ni->type, (unsigned long long)vcn,
+                                vcn_ofs, is_retry ? " even after "
+                                "retrying" : "", (long long)lcn);
+                if (!err)
+                        err = -EIO;
+                break;
+        } while (block++, (bh = bh->b_this_page) != head);
+        /* Release the lock if we took it. */
+        if (rl)
+                up_read(&ni->runlist.lock);
+        /* For the error case, need to reset bh to the beginning. */
+        bh = head;
+        /* Just an optimization, so ->readpage() isn't called later. */
+        if (unlikely(!PageUptodate(page))) {
+                int uptodate = 1;
+                do {
+                        if (!buffer_uptodate(bh)) {
+                                uptodate = 0;
+                                bh = head;
+                                break;
+                        }
+                } while ((bh = bh->b_this_page) != head);
+                if (uptodate)
+                        SetPageUptodate(page);
+        }
+        /*
+         * Setup all mapped, dirty buffers for async write i/o.  A reference
+         * is taken on every buffer here and dropped again in the submission
+         * loop below.
+         */
+        do {
+                get_bh(bh);
+                if (buffer_mapped(bh) && buffer_dirty(bh)) {
+                        lock_buffer(bh);
+                        if (test_clear_buffer_dirty(bh)) {
+                                BUG_ON(!buffer_uptodate(bh));
+                                mark_buffer_async_write(bh);
+                        } else
+                                unlock_buffer(bh);
+                } else if (unlikely(err)) {
+                        /*
+                         * For the error case. The buffer may have been set
+                         * dirty during attachment to a dirty page.
+                         */
+                        if (err != -ENOMEM)
+                                clear_buffer_dirty(bh);
+                }
+        } while ((bh = bh->b_this_page) != head);
+        if (unlikely(err)) {
+                // TODO: Remove the -EOPNOTSUPP check later on...
+                if (unlikely(err == -EOPNOTSUPP))
+                        err = 0;
+                else if (err == -ENOMEM) {
+                        ntfs_warning(vol->sb, "Error allocating memory. "
+                                        "Redirtying page so we try again "
+                                        "later.");
+                        /*
+                         * Put the page back on mapping->dirty_pages, but
+                         * leave its buffer's dirty state as-is.
+                         */
+                        redirty_page_for_writepage(wbc, page);
+                        err = 0;
+                } else
+                        SetPageError(page);
+        }
+        BUG_ON(PageWriteback(page));
+        set_page_writeback(page);       /* Keeps try_to_free_buffers() away. */
+        unlock_page(page);
+        /*
+         * Submit the prepared buffers for i/o. Note the page is unlocked,
+         * and the async write i/o completion handler can end_page_writeback()
+         * at any time after the *first* submit_bh(). So the buffers can then
+         * disappear...
+         */
+        need_end_writeback = TRUE;
+        do {
+                /* Fetch the next buffer before this one can go away. */
+                struct buffer_head *next = bh->b_this_page;
+                if (buffer_async_write(bh)) {
+                        submit_bh(WRITE, bh);
+                        need_end_writeback = FALSE;
+                }
+                put_bh(bh);
+                bh = next;
+        } while (bh != head);
+        /* If no i/o was started, need to end_page_writeback(). */
+        if (unlikely(need_end_writeback))
+                end_page_writeback(page);
+        ntfs_debug("Done.");
+        return err;
+}
+/**
+ * ntfs_write_mst_block - write a @page to the backing store
+ * @page:       page cache page to write out
+ * @wbc:        writeback control structure
+ *
+ * This function is for writing pages belonging to non-resident, mst protected
+ * attributes to their backing store.  The only supported attributes are index
+ * allocation and $MFT/$DATA.  Both directory inodes and index inodes are
+ * supported for the index allocation case.
+ *
+ * The page must remain locked for the duration of the write because we apply
+ * the mst fixups, write, and then undo the fixups, so if we were to unlock the
+ * page before undoing the fixups, any other user of the page will see the
+ * page contents as corrupt.
+ *
+ * We clear the page uptodate flag for the duration of the function to ensure
+ * exclusion for the $MFT/$DATA case against someone mapping an mft record we
+ * are about to apply the mst fixups to.
+ *
+ * Return 0 on success and -errno on error.
+ *
+ * Based on ntfs_write_block(), ntfs_mft_writepage(), and
+ * write_mft_record_nolock().
+ */
+static int ntfs_write_mst_block(struct page *page,
+                struct writeback_control *wbc)
+{
+        sector_t block, dblock, rec_block;
+        struct inode *vi = page->mapping->host;
+        ntfs_inode *ni = NTFS_I(vi);
+        ntfs_volume *vol = ni->vol;
+        u8 *kaddr;
+        unsigned char bh_size_bits = vi->i_blkbits;
+        unsigned int bh_size = 1 << bh_size_bits;
+        unsigned int rec_size = ni->itype.index.block_size;
+        ntfs_inode *locked_nis[PAGE_CACHE_SIZE / rec_size];
+        struct buffer_head *bh, *head, *tbh, *rec_start_bh;
+        int max_bhs = PAGE_CACHE_SIZE / bh_size;
+        struct buffer_head *bhs[max_bhs];
+        runlist_element *rl;
+        int i, nr_locked_nis, nr_recs, nr_bhs, bhs_per_rec, err, err2;
+        unsigned rec_size_bits;
+        BOOL sync, is_mft, page_is_dirty, rec_is_dirty;
+        ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
+                        "0x%lx.", vi->i_ino, ni->type, page->index);
+        BUG_ON(!NInoNonResident(ni));
+        BUG_ON(!NInoMstProtected(ni));
+        is_mft = (S_ISREG(vi->i_mode) && !vi->i_ino);
+        /*
+         * NOTE: ntfs_write_mst_block() would be called for $MFTMirr if a page
+         * in its page cache were to be marked dirty.  However this should
+         * never happen with the current driver and considering we do not
+         * handle this case here we do want to BUG(), at least for now.
+         */
+        BUG_ON(!(is_mft || S_ISDIR(vi->i_mode) ||
+                        (NInoAttr(ni) && ni->type == AT_INDEX_ALLOCATION)));
+        BUG_ON(!max_bhs);
+        /* Were we called for sync purposes? */
+        sync = (wbc->sync_mode == WB_SYNC_ALL);
+        /* Make sure we have mapped buffers. */
+        BUG_ON(!page_has_buffers(page));
+        bh = head = page_buffers(page);
+        BUG_ON(!bh);
+        rec_size_bits = ni->itype.index.block_size_bits;
+        BUG_ON(!(PAGE_CACHE_SIZE >> rec_size_bits));
+        bhs_per_rec = rec_size >> bh_size_bits;
+        BUG_ON(!bhs_per_rec);
+        /* The first block in the page. */
+        rec_block = block = (sector_t)page->index <<
+                        (PAGE_CACHE_SHIFT - bh_size_bits);
+        /* The first out of bounds block for the data size. */
+        dblock = (vi->i_size + bh_size - 1) >> bh_size_bits;
+        rl = NULL;
+        err = err2 = nr_bhs = nr_recs = nr_locked_nis = 0;
+        page_is_dirty = rec_is_dirty = FALSE;
+        rec_start_bh = NULL;
+        do {
+                BOOL is_retry = FALSE;
+                if (likely(block < rec_block)) {
+                        if (unlikely(block >= dblock)) {
+                                clear_buffer_dirty(bh);
+                                continue;
+                        }
+                        /*
+                         * This block is not the first one in the record.  We
+                         * ignore the buffer's dirty state because we could
+                         * have raced with a parallel mark_ntfs_record_dirty().
+                         */
+                        if (!rec_is_dirty)
+                                continue;
+                        if (unlikely(err2)) {
+                                if (err2 != -ENOMEM)
+                                        clear_buffer_dirty(bh);
+                                continue;
+                        }
+                } else /* if (block == rec_block) */ {
+                        BUG_ON(block > rec_block);
+                        /* This block is the first one in the record. */
+                        rec_block += bhs_per_rec;
+                        err2 = 0;
+                        if (unlikely(block >= dblock)) {
+                                clear_buffer_dirty(bh);
+                                continue;
+                        }
+                        if (!buffer_dirty(bh)) {
+                                /* Clean records are not written out. */
+                                rec_is_dirty = FALSE;
+                                continue;
+                        }
+                        rec_is_dirty = TRUE;
+                        rec_start_bh = bh;
+                }
+                /* Need to map the buffer if it is not mapped already. */
+                if (unlikely(!buffer_mapped(bh))) {
+                        VCN vcn;
+                        LCN lcn;
+                        unsigned int vcn_ofs;
+                        /* Obtain the vcn and offset of the current block. */
+                        vcn = (VCN)block << bh_size_bits;
+                        vcn_ofs = vcn & vol->cluster_size_mask;
+                        vcn >>= vol->cluster_size_bits;
+                        if (!rl) {
+lock_retry_remap:
+                                down_read(&ni->runlist.lock);
+                                rl = ni->runlist.rl;
+                        }
+                        if (likely(rl != NULL)) {
+                                /* Seek to element containing target vcn. */
+                                while (rl->length && rl[1].vcn <= vcn)
+                                        rl++;
+                                lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
+                        } else
+                                lcn = LCN_RL_NOT_MAPPED;
+                        /* Successful remap. */
+                        if (likely(lcn >= 0)) {
+                                /* Setup buffer head to correct block. */
+                                bh->b_blocknr = ((lcn <<
+                                                vol->cluster_size_bits) +
+                                                vcn_ofs) >> bh_size_bits;
+                                set_buffer_mapped(bh);
+                        } else {
+                                /*
+                                 * Remap failed.  Retry to map the runlist once
+                                 * unless we are working on $MFT which always
+                                 * has the whole of its runlist in memory.
+                                 */
+                                if (!is_mft && !is_retry &&
+                                                lcn == LCN_RL_NOT_MAPPED) {
+                                        is_retry = TRUE;
+                                        /*
+                                         * Attempt to map runlist, dropping
+                                         * lock for the duration.
+                                         */
+                                        up_read(&ni->runlist.lock);
+                                        err2 = ntfs_map_runlist(ni, vcn);
+                                        if (likely(!err2))
+                                                goto lock_retry_remap;
+                                        if (err2 == -ENOMEM)
+                                                page_is_dirty = TRUE;
+                                        lcn = err2;
+                                } else
+                                        err2 = -EIO;
+                                /* Hard error.  Abort writing this record. */
+                                if (!err || err == -ENOMEM)
+                                        err = err2;
+                                bh->b_blocknr = -1;
+                                ntfs_error(vol->sb, "Cannot write ntfs record "
+                                                "0x%llx (inode 0x%lx, "
+                                                "attribute type 0x%x) because "
+                                                "its location on disk could "
+                                                "not be determined (error "
+                                                "code %lli).", (s64)block <<
+                                                bh_size_bits >>
+                                                vol->mft_record_size_bits,
+                                                ni->mft_no, ni->type,
+                                                (long long)lcn);
+                                /*
+                                 * If this is not the first buffer, remove the
+                                 * buffers in this record from the list of
+                                 * buffers to write and clear their dirty bit
+                                 * if not error -ENOMEM.
+                                 */
+                                if (rec_start_bh != bh) {
+                                        while (bhs[--nr_bhs] != rec_start_bh)
+                                                ;
+                                        if (err2 != -ENOMEM) {
+                                                do {
+                                                        clear_buffer_dirty(
+                                                                rec_start_bh);
+                                                } while ((rec_start_bh =
+                                                                rec_start_bh->
+                                                                b_this_page) !=
+                                                                bh);
+                                        }
+                                }
+                                continue;
+                        }
+                }
+                BUG_ON(!buffer_uptodate(bh));
+                BUG_ON(nr_bhs >= max_bhs);
+                bhs[nr_bhs++] = bh;
+        } while (block++, (bh = bh->b_this_page) != head);
+        if (unlikely(rl))
+                up_read(&ni->runlist.lock);
+        /* If there were no dirty buffers, we are done. */
+        if (!nr_bhs)
+                goto done;
+        /* Map the page so we can access its contents. */
+        kaddr = kmap(page);
+        /* Clear the page uptodate flag whilst the mst fixups are applied. */
+        BUG_ON(!PageUptodate(page));
+        ClearPageUptodate(page);
+        for (i = 0; i < nr_bhs; i++) {
+                unsigned int ofs;
+                /* Skip buffers which are not at the beginning of records. */
+                if (i % bhs_per_rec)
+                        continue;
+                tbh = bhs[i];
+                ofs = bh_offset(tbh);
+                if (is_mft) {
+                        ntfs_inode *tni;
+                        unsigned long mft_no;
+                        /* Get the mft record number. */
+                        mft_no = (((s64)page->index << PAGE_CACHE_SHIFT) + ofs)
+                                        >> rec_size_bits;
+                        /* Check whether to write this mft record. */
+                        tni = NULL;
+                        if (!ntfs_may_write_mft_record(vol, mft_no,
+                                        (MFT_RECORD*)(kaddr + ofs), &tni)) {
+                                /*
+                                 * The record should not be written.  This
+                                 * means we need to redirty the page before
+                                 * returning.
+                                 */
+                                page_is_dirty = TRUE;
+                                /*
+                                 * Remove the buffers in this mft record from
+                                 * the list of buffers to write.
+                                 */
+                                do {
+                                        bhs[i] = NULL;
+                                } while (++i % bhs_per_rec);
+                                continue;
+                        }
+                        /*
+                         * The record should be written.  If a locked ntfs
+                         * inode was returned, add it to the array of locked
+                         * ntfs inodes.
+                         */
+                        if (tni)
+                                locked_nis[nr_locked_nis++] = tni;
+                }
+                /* Apply the mst protection fixups. */
+                err2 = pre_write_mst_fixup((NTFS_RECORD*)(kaddr + ofs),
+                                rec_size);
+                if (unlikely(err2)) {
+                        if (!err || err == -ENOMEM)
+                                err = -EIO;
+                        ntfs_error(vol->sb, "Failed to apply mst fixups "
+                                        "(inode 0x%lx, attribute type 0x%x, "
+                                        "page index 0x%lx, page offset 0x%x)!"
+                                        "  Unmount and run chkdsk.", vi->i_ino,
+                                        ni->type, page->index, ofs);
+                        /*
+                         * Mark all the buffers in this record clean as we do
+                         * not want to write corrupt data to disk.
+                         */
+                        do {
+                                clear_buffer_dirty(bhs[i]);
+                                bhs[i] = NULL;
+                        } while (++i % bhs_per_rec);
+                        continue;
+                }
+                nr_recs++;
+        }
+        /* If no records are to be written out, we are done. */
+        if (!nr_recs)
+                goto unm_done;
+        flush_dcache_page(page);
+        /* Lock buffers and start synchronous write i/o on them. */
+        for (i = 0; i < nr_bhs; i++) {
+                tbh = bhs[i];
+                if (!tbh)
+                        continue;
+                if (unlikely(test_set_buffer_locked(tbh)))
+                        BUG();
+                /* The buffer dirty state is now irrelevant, just clean it. */
+                clear_buffer_dirty(tbh);
+                BUG_ON(!buffer_uptodate(tbh));
+                BUG_ON(!buffer_mapped(tbh));
+                get_bh(tbh);
+                tbh->b_end_io = end_buffer_write_sync;
+                submit_bh(WRITE, tbh);
+        }
+        /* Synchronize the mft mirror now if not @sync. */
+        if (is_mft && !sync)
+                goto do_mirror;
+do_wait:
+        /* Wait on i/o completion of buffers. */
+        for (i = 0; i < nr_bhs; i++) {
+                tbh = bhs[i];
+                if (!tbh)
+                        continue;
+                wait_on_buffer(tbh);
+                if (unlikely(!buffer_uptodate(tbh))) {
+                        ntfs_error(vol->sb, "I/O error while writing ntfs "
+                                        "record buffer (inode 0x%lx, "
+                                        "attribute type 0x%x, page index "
+                                        "0x%lx, page offset 0x%lx)!  Unmount "
+                                        "and run chkdsk.", vi->i_ino, ni->type,
+                                        page->index, bh_offset(tbh));
+                        if (!err || err == -ENOMEM)
+                                err = -EIO;
+                        /*
+                         * Set the buffer uptodate so the page and buffer
+                         * states do not become out of sync.
+                         */
+                        set_buffer_uptodate(tbh);
+                }
+        }
+        /* If @sync, now synchronize the mft mirror. */
+        if (is_mft && sync) {
+do_mirror:
+                for (i = 0; i < nr_bhs; i++) {
+                        unsigned long mft_no;
+                        unsigned int ofs;
+                        /*
+                         * Skip buffers which are not at the beginning of
+                         * records.
+                         */
+                        if (i % bhs_per_rec)
+                                continue;
+                        tbh = bhs[i];
+                        /* Skip removed buffers (and hence records). */
+                        if (!tbh)
+                                continue;
+                        ofs = bh_offset(tbh);
+                        /* Get the mft record number. */
+                        mft_no = (((s64)page->index << PAGE_CACHE_SHIFT) + ofs)
+                                        >> rec_size_bits;
+                        if (mft_no < vol->mftmirr_size)
+                                ntfs_sync_mft_mirror(vol, mft_no,
+                                                (MFT_RECORD*)(kaddr + ofs),
+                                                sync);
+                }
+                if (!sync)
+                        goto do_wait;
+        }
+        /* Remove the mst protection fixups again. */
+        for (i = 0; i < nr_bhs; i++) {
+                if (!(i % bhs_per_rec)) {
+                        tbh = bhs[i];
+                        if (!tbh)
+                                continue;
+                        post_write_mst_fixup((NTFS_RECORD*)(kaddr +
+                                        bh_offset(tbh)));
+                }
+        }
+        flush_dcache_page(page);
+unm_done:
+        /* Unlock any locked inodes. */
+        while (nr_locked_nis-- > 0) {
+                ntfs_inode *tni, *base_tni;
+                
+                tni = locked_nis[nr_locked_nis];
+                /* Get the base inode. */
+                down(&tni->extent_lock);
+                if (tni->nr_extents >= 0)
+                        base_tni = tni;
+                else {
+                        base_tni = tni->ext.base_ntfs_ino;
+                        BUG_ON(!base_tni);
+                }
+                up(&tni->extent_lock);
+                ntfs_debug("Unlocking %s inode 0x%lx.",
+                                tni == base_tni ? "base" : "extent",
+                                tni->mft_no);
+                up(&tni->mrec_lock);
+                atomic_dec(&tni->count);
+                iput(VFS_I(base_tni));
+        }
+        SetPageUptodate(page);
+        kunmap(page);
+done:
+        if (unlikely(err && err != -ENOMEM)) {
+                /*
+                 * Set page error if there is only one ntfs record in the page.
+                 * Otherwise we would loose per-record granularity.
+                 */
+                if (ni->itype.index.block_size == PAGE_CACHE_SIZE)
+                        SetPageError(page);
+                NVolSetErrors(vol);
+        }
+        if (page_is_dirty) {
+                ntfs_debug("Page still contains one or more dirty ntfs "
+                                "records.  Redirtying the page starting at "
+                                "record 0x%lx.", page->index <<
+                                (PAGE_CACHE_SHIFT - rec_size_bits));
+                redirty_page_for_writepage(wbc, page);
+                unlock_page(page);
+        } else {
+                /*
+                 * Keep the VM happy.  This must be done otherwise the
+                 * radix-tree tag PAGECACHE_TAG_DIRTY remains set even though
+                 * the page is clean.
+                 */
+                BUG_ON(PageWriteback(page));
+                set_page_writeback(page);
+                unlock_page(page);
+                end_page_writeback(page);
+        }
+        if (likely(!err))
+                ntfs_debug("Done.");
+        return err;
+}
+/**
+ * ntfs_writepage - write a @page to the backing store
+ * @page:       page cache page to write out
+ * @wbc:        writeback control structure
+ *
+ * This is called from the VM when it wants to have a dirty ntfs page cache
+ * page cleaned.  The VM has already locked the page and marked it clean.
+ *
+ * For non-resident attributes, ntfs_writepage() writes the @page by calling
+ * the ntfs version of the generic block_write_full_page() function,
+ * ntfs_write_block(), which in turn if necessary creates and writes the
+ * buffers associated with the page asynchronously.
+ *
+ * For resident attributes, OTOH, ntfs_writepage() writes the @page by copying
+ * the data to the mft record (which at this stage is most likely in memory).
+ * The mft record is then marked dirty and written out asynchronously via the
+ * vfs inode dirty code path for the inode the mft record belongs to or via the
+ * vm page dirty code path for the page the mft record is in.
+ *
+ * Based on ntfs_readpage() and fs/buffer.c::block_write_full_page().
+ *
+ * Return 0 on success and -errno on error.  For resident attributes, -ENOMEM
+ * is not returned: the page is redirtied for a later retry and 0 is returned.
+ */
+static int ntfs_writepage(struct page *page, struct writeback_control *wbc)
+{
+        loff_t i_size;
+        struct inode *vi;
+        ntfs_inode *ni, *base_ni;
+        char *kaddr;
+        ntfs_attr_search_ctx *ctx;
+        MFT_RECORD *m;
+        u32 attr_len;
+        int err;
+        BUG_ON(!PageLocked(page));
+        vi = page->mapping->host;
+        i_size = i_size_read(vi);
+        /* Is the page fully outside i_size? (truncate in progress) */
+        if (unlikely(page->index >= (i_size + PAGE_CACHE_SIZE - 1) >>
+                        PAGE_CACHE_SHIFT)) {
+                /*
+                 * The page may have dirty, unmapped buffers.  Make them
+                 * freeable here, so the page does not leak.
+                 */
+                block_invalidatepage(page, 0);
+                unlock_page(page);
+                ntfs_debug("Write outside i_size - truncated?");
+                return 0;
+        }
+        ni = NTFS_I(vi);
+        /* NInoNonResident() == NInoIndexAllocPresent() */
+        if (NInoNonResident(ni)) {
+                /*
+                 * Only unnamed $DATA attributes can be compressed, encrypted,
+                 * and/or sparse.
+                 */
+                if (ni->type == AT_DATA && !ni->name_len) {
+                        /* If file is encrypted, deny access, just like NT4. */
+                        if (NInoEncrypted(ni)) {
+                                unlock_page(page);
+                                ntfs_debug("Denying write access to encrypted "
+                                                "file.");
+                                return -EACCES;
+                        }
+                        /* Compressed data streams are handled in compress.c. */
+                        if (NInoCompressed(ni)) {
+                                // TODO: Implement and replace this check with
+                                // return ntfs_write_compressed_block(page);
+                                unlock_page(page);
+                                ntfs_error(vi->i_sb, "Writing to compressed "
+                                                "files is not supported yet. "
+                                                "Sorry.");
+                                return -EOPNOTSUPP;
+                        }
+                        // TODO: Implement and remove this check.
+                        if (NInoSparse(ni)) {
+                                unlock_page(page);
+                                ntfs_error(vi->i_sb, "Writing to sparse files "
+                                                "is not supported yet. Sorry.");
+                                return -EOPNOTSUPP;
+                        }
+                }
+                /* We have to zero every time due to mmap-at-end-of-file. */
+                if (page->index >= (i_size >> PAGE_CACHE_SHIFT)) {
+                        /* The page straddles i_size. */
+                        unsigned int ofs = i_size & ~PAGE_CACHE_MASK;
+                        kaddr = kmap_atomic(page, KM_USER0);
+                        memset(kaddr + ofs, 0, PAGE_CACHE_SIZE - ofs);
+                        flush_dcache_page(page);
+                        kunmap_atomic(kaddr, KM_USER0);
+                }
+                /* Handle mst protected attributes. */
+                if (NInoMstProtected(ni))
+                        return ntfs_write_mst_block(page, wbc);
+                /* Normal data stream. */
+                return ntfs_write_block(page, wbc);
+        }
+        /*
+         * Attribute is resident, implying it is not compressed, encrypted,
+         * sparse, or mst protected.  This also means the attribute is smaller
+         * than an mft record and hence smaller than a page, so can simply
+         * return error on any pages with index above 0.
+         */
+        BUG_ON(page_has_buffers(page));
+        BUG_ON(!PageUptodate(page));
+        if (unlikely(page->index > 0)) {
+                ntfs_error(vi->i_sb, "BUG()! page->index (0x%lx) > 0.  "
+                                "Aborting write.", page->index);
+                BUG_ON(PageWriteback(page));
+                set_page_writeback(page);
+                unlock_page(page);
+                end_page_writeback(page);
+                return -EIO;
+        }
+        if (!NInoAttr(ni))
+                base_ni = ni;
+        else
+                base_ni = ni->ext.base_ntfs_ino;
+        /* Map, pin, and lock the mft record. */
+        m = map_mft_record(base_ni);
+        if (IS_ERR(m)) {
+                err = PTR_ERR(m);
+                m = NULL;
+                ctx = NULL;
+                goto err_out;
+        }
+        ctx = ntfs_attr_get_search_ctx(base_ni, m);
+        if (unlikely(!ctx)) {
+                err = -ENOMEM;
+                goto err_out;
+        }
+        err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+                        CASE_SENSITIVE, 0, NULL, 0, ctx);
+        if (unlikely(err))
+                goto err_out;
+        /*
+         * Keep the VM happy.  This must be done otherwise the radix-tree tag
+         * PAGECACHE_TAG_DIRTY remains set even though the page is clean.
+         */
+        BUG_ON(PageWriteback(page));
+        set_page_writeback(page);
+        unlock_page(page);
+        /*
+         * We do not need to zero the out of bounds area every time because
+         * the below memcpy() already takes care of the mmap-at-end-of-file
+         * requirements. If the file is converted to a non-resident one, then
+         * the code path used is switched to the non-resident one where the
+         * zeroing happens on each ntfs_writepage() invocation.
+         *
+         * The above also applies nicely when i_size is decreased.
+         *
+         * When i_size is increased, the memory between the old and new i_size
+         * _must_ be zeroed (or overwritten with new data). Otherwise we will
+         * expose data to userspace/disk which should never have been exposed.
+         *
+         * FIXME: Ensure that i_size increases do the zeroing/overwriting and
+         * if we cannot guarantee that, then enable the zeroing below.  If the
+         * zeroing below is enabled, we MUST move the unlock_page() from above
+         * to after the kunmap_atomic(), i.e. just before the
+         * end_page_writeback().
+         * UPDATE: ntfs_prepare/commit_write() do the zeroing on i_size
+         * increases for resident attributes so those are ok.
+         * TODO: ntfs_truncate(), others?
+         */
+        attr_len = le32_to_cpu(ctx->attr->data.resident.value_length);
+        i_size = i_size_read(VFS_I(ni));
+        kaddr = kmap_atomic(page, KM_USER0);
+        if (unlikely(attr_len > i_size)) {
+                /* Zero out of bounds area in the mft record. */
+                memset((u8*)ctx->attr + le16_to_cpu(
+                                ctx->attr->data.resident.value_offset) +
+                                i_size, 0, attr_len - i_size);
+                attr_len = i_size;
+        }
+        /* Copy the data from the page to the mft record. */
+        memcpy((u8*)ctx->attr +
+                        le16_to_cpu(ctx->attr->data.resident.value_offset),
+                        kaddr, attr_len);
+        flush_dcache_mft_record_page(ctx->ntfs_ino);
+        /* Zero out of bounds area in the page cache page. */
+        memset(kaddr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
+        flush_dcache_page(page);
+        kunmap_atomic(kaddr, KM_USER0);
+        end_page_writeback(page);
+        /* Mark the mft record dirty, so it gets written back. */
+        mark_mft_record_dirty(ctx->ntfs_ino);
+        ntfs_attr_put_search_ctx(ctx);
+        unmap_mft_record(base_ni);
+        return 0;
+err_out:
+        if (err == -ENOMEM) {
+                ntfs_warning(vi->i_sb, "Error allocating memory. Redirtying "
+                                "page so we try again later.");
+                /* Transient failure: redirty the page and report success. */
+                redirty_page_for_writepage(wbc, page);
+                err = 0;
+        } else {
+                ntfs_error(vi->i_sb, "Resident attribute write failed with "
+                                "error %i.  Setting page error flag.", err);
+                SetPageError(page);
+                /* Hard failure: flag the volume as containing errors. */
+                NVolSetErrors(ni->vol);
+        }
+        unlock_page(page);
+        if (ctx)
+                ntfs_attr_put_search_ctx(ctx);
+        if (m)
+                unmap_mft_record(base_ni);
+        return err;
+}
+/**
+ * ntfs_prepare_nonresident_write -
+ *
+ */
+static int ntfs_prepare_nonresident_write(struct page *page,
+                unsigned from, unsigned to)
+{
+        VCN vcn;
+        LCN lcn;
+        sector_t block, ablock, iblock;
+        struct inode *vi;
+        ntfs_inode *ni;
+        ntfs_volume *vol;
+        runlist_element *rl;
+        struct buffer_head *bh, *head, *wait[2], **wait_bh = wait;
+        unsigned int vcn_ofs, block_start, block_end, blocksize;
+        int err;
+        BOOL is_retry;
+        unsigned char blocksize_bits;
+        vi = page->mapping->host;
+        ni = NTFS_I(vi);
+        vol = ni->vol;
+        ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
+                        "0x%lx, from = %u, to = %u.", ni->mft_no, ni->type,
+                        page->index, from, to);
+        BUG_ON(!NInoNonResident(ni));
+        blocksize_bits = vi->i_blkbits;
+        blocksize = 1 << blocksize_bits;
+        /*
+         * create_empty_buffers() will create uptodate/dirty buffers if the
+         * page is uptodate/dirty.
+         */
+        if (!page_has_buffers(page))
+                create_empty_buffers(page, blocksize, 0);
+        bh = head = page_buffers(page);
+        if (unlikely(!bh))
+                return -ENOMEM;
+        /* The first block in the page. */
+        block = (s64)page->index << (PAGE_CACHE_SHIFT - blocksize_bits);
+        /*
+         * The first out of bounds block for the allocated size. No need to
+         * round up as allocated_size is in multiples of cluster size and the
+         * minimum cluster size is 512 bytes, which is equal to the smallest
+         * blocksize.
+         */
+        ablock = ni->allocated_size >> blocksize_bits;
+        /* The last (fully or partially) initialized block. */
+        iblock = ni->initialized_size >> blocksize_bits;
+        /* Loop through all the buffers in the page. */
+        block_start = 0;
+        rl = NULL;
+        err = 0;
+        do {
+                block_end = block_start + blocksize;
+                /*
+                 * If buffer @bh is outside the write, just mark it uptodate
+                 * if the page is uptodate and continue with the next buffer.
+                 */
+                if (block_end <= from || block_start >= to) {
+                        if (PageUptodate(page)) {
+                                if (!buffer_uptodate(bh))
+                                        set_buffer_uptodate(bh);
+                        }
+                        continue;
+                }
+                /*
+                 * @bh is at least partially being written to.
+                 * Make sure it is not marked as new.
+                 */
+                //if (buffer_new(bh))
+                //      clear_buffer_new(bh);
+                if (block >= ablock) {
+                        // TODO: block is above allocated_size, need to
+                        // allocate it. Best done in one go to accommodate not
+                        // only block but all above blocks up to and including:
+                        // ((page->index << PAGE_CACHE_SHIFT) + to + blocksize
+                        // - 1) >> blobksize_bits. Obviously will need to round
+                        // up to next cluster boundary, too. This should be
+                        // done with a helper function, so it can be reused.
+                        ntfs_error(vol->sb, "Writing beyond allocated size "
+                                        "is not supported yet. Sorry.");
+                        err = -EOPNOTSUPP;
+                        goto err_out;
+                        // Need to update ablock.
+                        // Need to set_buffer_new() on all block bhs that are
+                        // newly allocated.
+                }
+                /*
+                 * Now we have enough allocated size to fulfill the whole
+                 * request, i.e. block < ablock is true.
+                 */
+                if (unlikely((block >= iblock) &&
+                                (ni->initialized_size < vi->i_size))) {
+                        /*
+                         * If this page is fully outside initialized size, zero
+                         * out all pages between the current initialized size
+                         * and the current page. Just use ntfs_readpage() to do
+                         * the zeroing transparently.
+                         */
+                        if (block > iblock) {
+                                // TODO:
+                                // For each page do:
+                                // - read_cache_page()
+                                // Again for each page do:
+                                // - wait_on_page_locked()
+                                // - Check (PageUptodate(page) &&
+                                //                      !PageError(page))
+                                // Update initialized size in the attribute and
+                                // in the inode.
+                                // Again, for each page do:
+                                //      __set_page_dirty_buffers();
+                                // page_cache_release()
+                                // We don't need to wait on the writes.
+                                // Update iblock.
+                        }
+                        /*
+                         * The current page straddles initialized size. Zero
+                         * all non-uptodate buffers and set them uptodate (and
+                         * dirty?). Note, there aren't any non-uptodate buffers
+                         * if the page is uptodate.
+                         * FIXME: For an uptodate page, the buffers may need to
+                         * be written out because they were not initialized on
+                         * disk before.
+                         */
+                        if (!PageUptodate(page)) {
+                                // TODO:
+                                // Zero any non-uptodate buffers up to i_size.
+                                // Set them uptodate and dirty.
+                        }
+                        // TODO:
+                        // Update initialized size in the attribute and in the
+                        // inode (up to i_size).
+                        // Update iblock.
+                        // FIXME: This is inefficient. Try to batch the two
+                        // size changes to happen in one go.
+                        ntfs_error(vol->sb, "Writing beyond initialized size "
+                                        "is not supported yet. Sorry.");
+                        err = -EOPNOTSUPP;
+                        goto err_out;
+                        // Do NOT set_buffer_new() BUT DO clear buffer range
+                        // outside write request range.
+                        // set_buffer_uptodate() on complete buffers as well as
+                        // set_buffer_dirty().
+                }
+                /* Need to map unmapped buffers. */
+                if (!buffer_mapped(bh)) {
+                        /* Unmapped buffer. Need to map it. */
+                        bh->b_bdev = vol->sb->s_bdev;
+                        /* Convert block into corresponding vcn and offset. */
+                        vcn = (VCN)block << blocksize_bits >>
+                                        vol->cluster_size_bits;
+                        vcn_ofs = ((VCN)block << blocksize_bits) &
+                                        vol->cluster_size_mask;
+                        is_retry = FALSE;
+                        if (!rl) {
+lock_retry_remap:
+                                down_read(&ni->runlist.lock);
+                                rl = ni->runlist.rl;
+                        }
+                        if (likely(rl != NULL)) {
+                                /* Seek to element containing target vcn. */
+                                while (rl->length && rl[1].vcn <= vcn)
+                                        rl++;
+                                lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
+                        } else
+                                lcn = LCN_RL_NOT_MAPPED;
+                        if (unlikely(lcn < 0)) {
+                                /*
+                                 * We extended the attribute allocation above.
+                                 * If we hit an ENOENT here it means that the
+                                 * allocation was insufficient which is a bug.
+                                 */
+                                BUG_ON(lcn == LCN_ENOENT);
+                                /* It is a hole, need to instantiate it. */
+                                if (lcn == LCN_HOLE) {
+                                        // TODO: Instantiate the hole.
+                                        // clear_buffer_new(bh);
+                                        // unmap_underlying_metadata(bh->b_bdev,
+                                        //              bh->b_blocknr);
+                                        // For non-uptodate buffers, need to
+                                        // zero out the region outside the
+                                        // request in this bh or all bhs,
+                                        // depending on what we implemented
+                                        // above.
+                                        // Need to flush_dcache_page().
+                                        // Or could use set_buffer_new()
+                                        // instead?
+                                        ntfs_error(vol->sb, "Writing into "
+                                                        "sparse regions is "
+                                                        "not supported yet. "
+                                                        "Sorry.");
+                                        err = -EOPNOTSUPP;
+                                        goto err_out;
+                                } else if (!is_retry &&
+                                                lcn == LCN_RL_NOT_MAPPED) {
+                                        is_retry = TRUE;
+                                        /*
+                                         * Attempt to map runlist, dropping
+                                         * lock for the duration.
+                                         */
+                                        up_read(&ni->runlist.lock);
+                                        err = ntfs_map_runlist(ni, vcn);
+                                        if (likely(!err))
+                                                goto lock_retry_remap;
+                                        rl = NULL;
+                                        lcn = err;
+                                }
+                                /*
+                                 * Failed to map the buffer, even after
+                                 * retrying.
+                                 */
+                                bh->b_blocknr = -1;
+                                ntfs_error(vol->sb, "Failed to write to inode "
+                                                "0x%lx, attribute type 0x%x, "
+                                                "vcn 0x%llx, offset 0x%x "
+                                                "because its location on disk "
+                                                "could not be determined%s "
+                                                "(error code %lli).",
+                                                ni->mft_no, ni->type,
+                                                (unsigned long long)vcn,
+                                                vcn_ofs, is_retry ? " even "
+                                                "after retrying" : "",
+                                                (long long)lcn);
+                                if (!err)
+                                        err = -EIO;
+                                goto err_out;
+                        }
+                        /* We now have a successful remap, i.e. lcn >= 0. */
+                        /* Setup buffer head to correct block. */
+                        bh->b_blocknr = ((lcn << vol->cluster_size_bits)
+                                        + vcn_ofs) >> blocksize_bits;
+                        set_buffer_mapped(bh);
+                        // FIXME: Something analogous to this is needed for
+                        // each newly allocated block, i.e. BH_New.
+                        // FIXME: Might need to take this out of the
+                        // if (!buffer_mapped(bh)) {}, depending on how we
+                        // implement things during the allocated_size and
+                        // initialized_size extension code above.
+                        if (buffer_new(bh)) {
+                                clear_buffer_new(bh);
+                                unmap_underlying_metadata(bh->b_bdev,
+                                                bh->b_blocknr);
+                                if (PageUptodate(page)) {
+                                        set_buffer_uptodate(bh);
+                                        continue;
+                                }
+                                /*
+                                 * Page is _not_ uptodate, zero surrounding
+                                 * region. NOTE: This is how we decide if to
+                                 * zero or not!
+                                 */
+                                if (block_end > to || block_start < from) {
+                                        void *kaddr;
+                                        kaddr = kmap_atomic(page, KM_USER0);
+                                        if (block_end > to)
+                                                memset(kaddr + to, 0,
+                                                                block_end - to);
+                                        if (block_start < from)
+                                                memset(kaddr + block_start, 0,
+                                                                from -
+                                                                block_start);
+                                        flush_dcache_page(page);
+                                        kunmap_atomic(kaddr, KM_USER0);
+                                }
+                                continue;
+                        }
+                }
+                /* @bh is mapped, set it uptodate if the page is uptodate. */
+                if (PageUptodate(page)) {
+                        if (!buffer_uptodate(bh))
+                                set_buffer_uptodate(bh);
+                        continue;
+                }
+                /*
+                 * The page is not uptodate. The buffer is mapped. If it is not
+                 * uptodate, and it is only partially being written to, we need
+                 * to read the buffer in before the write, i.e. right now.
+                 */
+                if (!buffer_uptodate(bh) &&
+                                (block_start < from || block_end > to)) {
+                        ll_rw_block(READ, 1, &bh);
+                        *wait_bh++ = bh;
+                }
+        } while (block++, block_start = block_end,
+                        (bh = bh->b_this_page) != head);
+        /* Release the lock if we took it. */
+        if (rl) {
+                up_read(&ni->runlist.lock);
+                rl = NULL;
+        }
+        /* If we issued read requests, let them complete. */
+        while (wait_bh > wait) {
+                wait_on_buffer(*--wait_bh);
+                if (!buffer_uptodate(*wait_bh))
+                        return -EIO;
+        }
+        ntfs_debug("Done.");
+        return 0;
+err_out:
+        /*
+         * Zero out any newly allocated blocks to avoid exposing stale data.
+         * If BH_New is set, we know that the block was newly allocated in the
+         * above loop.
+         * FIXME: What about initialized_size increments? Have we done all the
+         * required zeroing above? If not this error handling is broken, and
+         * in particular the if (block_end <= from) check is completely bogus.
+         */
+        bh = head;
+        block_start = 0;
+        is_retry = FALSE;
+        do {
+                block_end = block_start + blocksize;
+                if (block_end <= from)
+                        continue;
+                if (block_start >= to)
+                        break;
+                if (buffer_new(bh)) {
+                        void *kaddr;
+                        clear_buffer_new(bh);
+                        kaddr = kmap_atomic(page, KM_USER0);
+                        memset(kaddr + block_start, 0, bh->b_size);
+                        kunmap_atomic(kaddr, KM_USER0);
+                        set_buffer_uptodate(bh);
+                        mark_buffer_dirty(bh);
+                        is_retry = TRUE;
+                }
+        } while (block_start = block_end, (bh = bh->b_this_page) != head);
+        if (is_retry)
+                flush_dcache_page(page);
+        if (rl)
+                up_read(&ni->runlist.lock);
+        return err;
+}
+/**
+ * ntfs_prepare_write - prepare a page for receiving data
+ * @file:       file being written to (not used by this function)
+ * @page:       locked page cache page that is about to receive the data
+ * @from:       offset in @page of the first byte that will be written to
+ * @to:         offset in @page of the first byte after the last byte written
+ *
+ * This is called from generic_file_write() with i_sem held on the inode
+ * (@page->mapping->host).  The @page is locked but not kmap()ped.  The source
+ * data has not yet been copied into the @page.
+ *
+ * Need to extend the attribute/fill in holes if necessary, create blocks and
+ * make partially overwritten blocks uptodate.
+ *
+ * i_size is not to be modified yet.
+ *
+ * Return 0 on success or -errno on error.
+ *
+ * Should be using block_prepare_write() [support for sparse files] or
+ * cont_prepare_write() [no support for sparse files].  Cannot do that due to
+ * ntfs specifics but can look at them for implementation guidance.
+ *
+ * Note: In the range, @from is inclusive and @to is exclusive, i.e. @from is
+ * the first byte in the page that will be written to and @to is the first byte
+ * after the last byte that will be written to.  It is a BUG() for either of
+ * them to exceed PAGE_CACHE_SIZE, for @from to be larger than @to, for @page
+ * not to be locked, or for the inode to be mst protected.
+ *
+ * Current behaviour in summary:
+ *      - If the inode is flagged NInoTruncateFailed(), ntfs_truncate() is
+ *        repeated first and the write is aborted if it fails again.
+ *      - Non-resident attribute: writes to an encrypted unnamed $DATA
+ *        attribute fail with -EACCES and writes to a compressed or sparse one
+ *        fail with -EOPNOTSUPP.  Everything else is handed over to
+ *        ntfs_prepare_nonresident_write().
+ *      - Resident attribute: a write ending at or before i_size needs no
+ *        preparation and returns 0.  A write ending beyond i_size fails with
+ *        -EOPNOTSUPP, which means the attribute resizing code following that
+ *        return statement is never executed at present.
+ *
+ * Error handling: the err_out label logs the failure and, unless the error is
+ * -ENOMEM, flags the volume as containing errors and makes the inode bad.  The
+ * err_out2 label only releases the search context and the mft record and is
+ * used for failures which do not indicate corruption.
+ */
+static int ntfs_prepare_write(struct file *file, struct page *page,
+                unsigned from, unsigned to)
+{
+        s64 new_size;
+        struct inode *vi = page->mapping->host;
+        ntfs_inode *base_ni = NULL, *ni = NTFS_I(vi);
+        ntfs_volume *vol = ni->vol;
+        ntfs_attr_search_ctx *ctx = NULL;
+        MFT_RECORD *m = NULL;
+        ATTR_RECORD *a;
+        u8 *kaddr;
+        u32 attr_len;
+        int err;
+        ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
+                        "0x%lx, from = %u, to = %u.", vi->i_ino, ni->type,
+                        page->index, from, to);
+        BUG_ON(!PageLocked(page));
+        BUG_ON(from > PAGE_CACHE_SIZE);
+        BUG_ON(to > PAGE_CACHE_SIZE);
+        BUG_ON(from > to);
+        BUG_ON(NInoMstProtected(ni));
+        /*
+         * If a previous ntfs_truncate() failed, repeat it and abort if it
+         * fails again.  The truncate is done with i_alloc_sem held for
+         * writing.  A truncate which returns success but leaves the failed
+         * flag set is turned into -EIO.
+         */
+        if (unlikely(NInoTruncateFailed(ni))) {
+                down_write(&vi->i_alloc_sem);
+                err = ntfs_truncate(vi);
+                up_write(&vi->i_alloc_sem);
+                if (err || NInoTruncateFailed(ni)) {
+                        if (!err)
+                                err = -EIO;
+                        goto err_out;
+                }
+        }
+        /* If the attribute is not resident, deal with it elsewhere. */
+        if (NInoNonResident(ni)) {
+                /*
+                 * Only unnamed $DATA attributes can be compressed, encrypted,
+                 * and/or sparse.
+                 */
+                if (ni->type == AT_DATA && !ni->name_len) {
+                        /* If file is encrypted, deny access, just like NT4. */
+                        if (NInoEncrypted(ni)) {
+                                ntfs_debug("Denying write access to encrypted "
+                                                "file.");
+                                return -EACCES;
+                        }
+                        /* Compressed data streams are handled in compress.c. */
+                        if (NInoCompressed(ni)) {
+                                // TODO: Implement and replace this check with
+                                // return ntfs_write_compressed_block(page);
+                                ntfs_error(vi->i_sb, "Writing to compressed "
+                                                "files is not supported yet. "
+                                                "Sorry.");
+                                return -EOPNOTSUPP;
+                        }
+                        // TODO: Implement and remove this check.
+                        if (NInoSparse(ni)) {
+                                ntfs_error(vi->i_sb, "Writing to sparse files "
+                                                "is not supported yet. Sorry.");
+                                return -EOPNOTSUPP;
+                        }
+                }
+                /* Normal data stream. */
+                return ntfs_prepare_nonresident_write(page, from, to);
+        }
+        /*
+         * Attribute is resident, implying it is not compressed, encrypted, or
+         * sparse.  Pages of resident attributes are not expected to have
+         * buffers attached.
+         */
+        BUG_ON(page_has_buffers(page));
+        /* The file offset, in bytes, at which the write ends. */
+        new_size = ((s64)page->index << PAGE_CACHE_SHIFT) + to;
+        /* If we do not need to resize the attribute allocation we are done. */
+        if (new_size <= vi->i_size)
+                goto done;
+        /*
+         * FIXME: We abort for now as this code is not safe.  Everything from
+         * here down to the done label is thus unreachable at present and is
+         * kept for when resizing of resident attributes gets enabled.
+         */
+        ntfs_error(vi->i_sb, "Changing the file size is not supported yet.  "
+                        "Sorry.");
+        return -EOPNOTSUPP;
+        /* Map, pin, and lock the (base) mft record. */
+        if (!NInoAttr(ni))
+                base_ni = ni;
+        else
+                base_ni = ni->ext.base_ntfs_ino;
+        m = map_mft_record(base_ni);
+        if (IS_ERR(m)) {
+                err = PTR_ERR(m);
+                m = NULL;
+                ctx = NULL;
+                goto err_out;
+        }
+        ctx = ntfs_attr_get_search_ctx(base_ni, m);
+        if (unlikely(!ctx)) {
+                err = -ENOMEM;
+                goto err_out;
+        }
+        err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+                        CASE_SENSITIVE, 0, NULL, 0, ctx);
+        if (unlikely(err)) {
+                if (err == -ENOENT)
+                        err = -EIO;
+                goto err_out;
+        }
+        m = ctx->mrec;
+        a = ctx->attr;
+        /* The total length of the attribute value. */
+        attr_len = le32_to_cpu(a->data.resident.value_length);
+        BUG_ON(vi->i_size != attr_len);
+        /* Check if new size is allowed in $AttrDef. */
+        err = ntfs_attr_size_bounds_check(vol, ni->type, new_size);
+        if (unlikely(err)) {
+                if (err == -ERANGE) {
+                        ntfs_error(vol->sb, "Write would cause the inode "
+                                        "0x%lx to exceed the maximum size for "
+                                        "its attribute type (0x%x).  Aborting "
+                                        "write.", vi->i_ino,
+                                        le32_to_cpu(ni->type));
+                } else {
+                        ntfs_error(vol->sb, "Inode 0x%lx has unknown "
+                                        "attribute type 0x%x.  Aborting "
+                                        "write.", vi->i_ino,
+                                        le32_to_cpu(ni->type));
+                        err = -EIO;
+                }
+                goto err_out2;
+        }
+        /*
+         * Extend the attribute record to be able to store the new attribute
+         * size.
+         */
+        if (new_size >= vol->mft_record_size || ntfs_attr_record_resize(m, a,
+                        le16_to_cpu(a->data.resident.value_offset) +
+                        new_size)) {
+                /* Not enough space in the mft record. */
+                ntfs_error(vol->sb, "Not enough space in the mft record for "
+                                "the resized attribute value.  This is not "
+                                "supported yet.  Aborting write.");
+                err = -EOPNOTSUPP;
+                goto err_out2;
+        }
+        /*
+         * We have enough space in the mft record to fit the write.  This
+         * implies the attribute is smaller than the mft record and hence the
+         * attribute must be in a single page and hence page->index must be 0.
+         */
+        BUG_ON(page->index);
+        /*
+         * If the beginning of the write is past the old size, enlarge the
+         * attribute value up to the beginning of the write and fill it with
+         * zeroes.
+         */
+        if (from > attr_len) {
+                memset((u8*)a + le16_to_cpu(a->data.resident.value_offset) +
+                                attr_len, 0, from - attr_len);
+                a->data.resident.value_length = cpu_to_le32(from);
+                /* Zero the corresponding area in the page as well. */
+                if (PageUptodate(page)) {
+                        kaddr = kmap_atomic(page, KM_USER0);
+                        memset(kaddr + attr_len, 0, from - attr_len);
+                        kunmap_atomic(kaddr, KM_USER0);
+                        flush_dcache_page(page);
+                }
+        }
+        flush_dcache_mft_record_page(ctx->ntfs_ino);
+        mark_mft_record_dirty(ctx->ntfs_ino);
+        ntfs_attr_put_search_ctx(ctx);
+        unmap_mft_record(base_ni);
+        /*
+         * Because resident attributes are handled by memcpy() to/from the
+         * corresponding MFT record, and because this form of i/o is byte
+         * aligned rather than block aligned, there is no need to bring the
+         * page uptodate here as in the non-resident case where we need to
+         * bring the buffers straddled by the write uptodate before
+         * generic_file_write() does the copying from userspace.
+         *
+         * We thus defer the uptodate bringing of the page region outside the
+         * region written to to ntfs_commit_write(), which makes the code
+         * simpler and saves one atomic kmap which is good.
+         */
+done:
+        ntfs_debug("Done.");
+        return 0;
+err_out:
+        if (err == -ENOMEM)
+                ntfs_warning(vi->i_sb, "Error allocating memory required to "
+                                "prepare the write.");
+        else {
+                ntfs_error(vi->i_sb, "Resident attribute prepare write failed "
+                                "with error %i.", err);
+                NVolSetErrors(vol);
+                make_bad_inode(vi);
+        }
+err_out2:
+        if (ctx)
+                ntfs_attr_put_search_ctx(ctx);
+        if (m)
+                unmap_mft_record(base_ni);
+        return err;
+}
+/**
+ * ntfs_commit_nonresident_write - commit received data, non-resident case
+ * @page:       page cache page that has received the data
+ * @from:       offset in @page of the first byte that was written
+ * @to:         offset in @page of the first byte after the last byte written
+ *
+ * Walk all buffers attached to @page.  Each buffer overlapping the written
+ * range [@from, @to) is set uptodate and marked dirty.  Buffers outside the
+ * range are left alone, but if any of them is not uptodate the @page as a
+ * whole is not set uptodate at the end of the walk.
+ *
+ * This is an unrolled copy of generic_commit_write() with the difference that
+ * i_size is never changed.  A write ending beyond i_size is refused instead.
+ * Note that the buffers have already been dirtied by the time this is
+ * detected.  ntfs_prepare_write() is expected to have failed for such writes
+ * already, so this is not meant to trigger in practice.
+ *
+ * Return 0 on success or -EOPNOTSUPP if the write ends beyond i_size.
+ */
+static int ntfs_commit_nonresident_write(struct page *page,
+                unsigned from, unsigned to)
+{
+        struct inode *vi = page->mapping->host;
+        struct buffer_head *first, *cur;
+        unsigned int bsize, start, end;
+        BOOL all_uptodate = TRUE;
+        s64 end_pos;
+        ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
+                        "0x%lx, from = %u, to = %u.", vi->i_ino,
+                        NTFS_I(vi)->type, page->index, from, to);
+        bsize = 1 << vi->i_blkbits;
+        /*
+         * FIXME: Special cases, e.g. for compressed files, are missing here.
+         * At present ntfs_prepare_write() is expected to fail in all cases
+         * that would need them, so they cannot reach this point.
+         */
+        first = page_buffers(page);
+        cur = first;
+        start = 0;
+        do {
+                end = start + bsize;
+                if (end > from && start < to) {
+                        /* Buffer received (some of) the written data. */
+                        set_buffer_uptodate(cur);
+                        mark_buffer_dirty(cur);
+                } else if (!buffer_uptodate(cur))
+                        all_uptodate = FALSE;
+                start = end;
+                cur = cur->b_this_page;
+        } while (cur != first);
+        /*
+         * A partial write can still leave every buffer uptodate.  Detecting
+         * that here saves a pointless ->readpage() on the next read().
+         */
+        if (all_uptodate)
+                SetPageUptodate(page);
+        /*
+         * Extending the file is not implemented.  Once it is, this is where
+         * i_size would be set to the end of the write and the inode marked
+         * dirty.
+         */
+        end_pos = ((s64)page->index << PAGE_CACHE_SHIFT) + to;
+        if (end_pos > vi->i_size) {
+                ntfs_error(vi->i_sb, "Writing beyond the existing file size is "
+                                "not supported yet.  Sorry.");
+                return -EOPNOTSUPP;
+        }
+        ntfs_debug("Done.");
+        return 0;
+}
+/**
+ * ntfs_commit_write - commit the received data
+ * @file:       file being written to (not used by this function)
+ * @page:       locked page cache page holding the received data
+ * @from:       offset in @page of the first byte that was written
+ * @to:         offset in @page of the first byte after the last byte written
+ *
+ * This is called from generic_file_write() with i_sem held on the inode
+ * (@page->mapping->host).  The @page is locked but not kmap()ped.  The source
+ * data has already been copied into the @page.  ntfs_prepare_write() has been
+ * called before the data copied and it returned success so we can take the
+ * results of various BUG checks and some error handling for granted.
+ *
+ * Need to mark modified blocks dirty so they get written out later when
+ * ntfs_writepage() is invoked by the VM.
+ *
+ * Return 0 on success or -errno on error.
+ *
+ * Should be using generic_commit_write().  This marks buffers uptodate and
+ * dirty, sets the page uptodate if all buffers in the page are uptodate, and
+ * updates i_size if the end of io is beyond i_size.  In that case, it also
+ * marks the inode dirty.
+ *
+ * Cannot use generic_commit_write() due to ntfs specialities but can look at
+ * it for implementation guidance.
+ *
+ * If things have gone as outlined in ntfs_prepare_write(), then we do not
+ * need to do any page content modifications here at all, except in the write
+ * to resident attribute case, where we need to do the uptodate bringing here
+ * which we combine with the copying into the mft record which means we save
+ * one atomic kmap.
+ *
+ * Non-resident attributes are handed over to ntfs_commit_nonresident_write().
+ *
+ * For resident attributes the bytes [@from, @to) are copied from the @page
+ * into the attribute value in the mft record, the value length is grown if
+ * the write ends beyond it, and the mft record is marked dirty.
+ *
+ * Error handling in the resident case: on -ENOMEM with an uptodate @page, the
+ * page is put on the dirty list so the write is retried later and 0 is
+ * returned.  On -ENOMEM with a not uptodate @page the written data is lost
+ * and the error is returned.  On any other error the volume is flagged as
+ * containing errors and the inode is made bad.
+ */
+static int ntfs_commit_write(struct file *file, struct page *page,
+                unsigned from, unsigned to)
+{
+        struct inode *vi = page->mapping->host;
+        ntfs_inode *base_ni, *ni = NTFS_I(vi);
+        /*
+         * Byte pointers into the page and into the attribute value.  They are
+         * u8 pointers, as in ntfs_prepare_write(), so that their type matches
+         * the (u8*) arithmetic used to locate the attribute value below.
+         */
+        u8 *kaddr, *kattr;
+        ntfs_attr_search_ctx *ctx;
+        MFT_RECORD *m;
+        ATTR_RECORD *a;
+        u32 attr_len;
+        int err;
+        ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
+                        "0x%lx, from = %u, to = %u.", vi->i_ino, ni->type,
+                        page->index, from, to);
+        /* If the attribute is not resident, deal with it elsewhere. */
+        if (NInoNonResident(ni)) {
+                /* Only unnamed $DATA attributes can be compressed/encrypted. */
+                if (ni->type == AT_DATA && !ni->name_len) {
+                        /* Encrypted files need separate handling. */
+                        if (NInoEncrypted(ni)) {
+                                // We never get here at present!
+                                BUG();
+                        }
+                        /* Compressed data streams are handled in compress.c. */
+                        if (NInoCompressed(ni)) {
+                                // TODO: Implement this!
+                                // return ntfs_write_compressed_block(page);
+                                // We never get here at present!
+                                BUG();
+                        }
+                }
+                /* Normal data stream. */
+                return ntfs_commit_nonresident_write(page, from, to);
+        }
+        /*
+         * Attribute is resident, implying it is not compressed, encrypted, or
+         * sparse.
+         */
+        if (!NInoAttr(ni))
+                base_ni = ni;
+        else
+                base_ni = ni->ext.base_ntfs_ino;
+        /* Map, pin, and lock the mft record. */
+        m = map_mft_record(base_ni);
+        if (IS_ERR(m)) {
+                err = PTR_ERR(m);
+                /* Nothing was acquired, so err_out must not release anything. */
+                m = NULL;
+                ctx = NULL;
+                goto err_out;
+        }
+        ctx = ntfs_attr_get_search_ctx(base_ni, m);
+        if (unlikely(!ctx)) {
+                err = -ENOMEM;
+                goto err_out;
+        }
+        err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+                        CASE_SENSITIVE, 0, NULL, 0, ctx);
+        if (unlikely(err)) {
+                /* The attribute must exist, so not finding it means -EIO. */
+                if (err == -ENOENT)
+                        err = -EIO;
+                goto err_out;
+        }
+        a = ctx->attr;
+        /* The total length of the attribute value. */
+        attr_len = le32_to_cpu(a->data.resident.value_length);
+        BUG_ON(from > attr_len);
+        kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset);
+        kaddr = kmap_atomic(page, KM_USER0);
+        /* Copy the received data from the page to the mft record. */
+        memcpy(kattr + from, kaddr + from, to - from);
+        /* Update the attribute length if necessary. */
+        if (to > attr_len) {
+                attr_len = to;
+                a->data.resident.value_length = cpu_to_le32(attr_len);
+        }
+        /*
+         * If the page is not uptodate, bring the out of bounds area(s)
+         * uptodate by copying data from the mft record to the page.
+         */
+        if (!PageUptodate(page)) {
+                if (from > 0)
+                        memcpy(kaddr, kattr, from);
+                if (to < attr_len)
+                        memcpy(kaddr + to, kattr + to, attr_len - to);
+                /* Zero the region outside the end of the attribute value. */
+                if (attr_len < PAGE_CACHE_SIZE)
+                        memset(kaddr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
+                /*
+                 * The probability of not having done any of the above is
+                 * extremely small, so we just flush unconditionally.
+                 */
+                flush_dcache_page(page);
+                SetPageUptodate(page);
+        }
+        kunmap_atomic(kaddr, KM_USER0);
+        /* Update i_size if necessary. */
+        if (vi->i_size < attr_len) {
+                ni->allocated_size = ni->initialized_size = attr_len;
+                i_size_write(vi, attr_len);
+        }
+        /* Mark the mft record dirty, so it gets written back. */
+        flush_dcache_mft_record_page(ctx->ntfs_ino);
+        mark_mft_record_dirty(ctx->ntfs_ino);
+        ntfs_attr_put_search_ctx(ctx);
+        unmap_mft_record(base_ni);
+        ntfs_debug("Done.");
+        return 0;
+err_out:
+        if (err == -ENOMEM) {
+                ntfs_warning(vi->i_sb, "Error allocating memory required to "
+                                "commit the write.");
+                if (PageUptodate(page)) {
+                        ntfs_warning(vi->i_sb, "Page is uptodate, setting "
+                                        "dirty so the write will be retried "
+                                        "later on by the VM.");
+                        /*
+                         * Put the page on mapping->dirty_pages, but leave its
+                         * buffers' dirty state as-is.
+                         */
+                        __set_page_dirty_nobuffers(page);
+                        /* The write is retried later, so report success. */
+                        err = 0;
+                } else
+                        ntfs_error(vi->i_sb, "Page is not uptodate.  Written "
+                                        "data has been lost.");
+        } else {
+                ntfs_error(vi->i_sb, "Resident attribute commit write failed "
+                                "with error %i.", err);
+                NVolSetErrors(ni->vol);
+                make_bad_inode(vi);
+        }
+        /* Release whatever had been acquired when the failure occurred. */
+        if (ctx)
+                ntfs_attr_put_search_ctx(ctx);
+        if (m)
+                unmap_mft_record(base_ni);
+        return err;
+}
+#endif  /* NTFS_RW */
+/**
+ * ntfs_aops - general address space operations for inodes and attributes
+ *
+ * The methods used for writing are only set when NTFS_RW is defined.
+ */
+struct address_space_operations ntfs_aops = {
+        /* Fill page with data. */
+        .readpage       = ntfs_readpage,
+        /* Currently, just unplugs the disk request queue. */
+        .sync_page      = block_sync_page,
+#ifdef NTFS_RW
+        /* Write dirty page to disk. */
+        .writepage      = ntfs_writepage,
+        /* Prepare page and buffers ready to receive data. */
+        .prepare_write  = ntfs_prepare_write,
+        /* Commit received data. */
+        .commit_write   = ntfs_commit_write,
+#endif /* NTFS_RW */
+};
+/**
+ * ntfs_mst_aops - general address space operations for mst protected inodes
+ *                 and attributes
+ *
+ * The methods used for writing are only set when NTFS_RW is defined.  No
+ * prepare_write and commit_write methods are set here.
+ */
+struct address_space_operations ntfs_mst_aops = {
+        /* Fill page with data. */
+        .readpage       = ntfs_readpage,
+        /* Currently, just unplugs the disk request queue. */
+        .sync_page      = block_sync_page,
+#ifdef NTFS_RW
+        /* Write dirty page to disk. */
+        .writepage      = ntfs_writepage,
+        /*
+         * Set the page dirty without touching the buffers belonging to the
+         * page.
+         */
+        .set_page_dirty = __set_page_dirty_nobuffers,
+#endif /* NTFS_RW */
+};
+#ifdef NTFS_RW
+/**
+ * mark_ntfs_record_dirty - mark an ntfs record dirty
+ * @page:       page containing the ntfs record to mark dirty
+ * @ofs:        byte offset within @page at which the ntfs record begins
+ *
+ * Dirty all buffers of @page which overlap the ntfs record beginning at @ofs
+ * and then dirty the @page itself without touching its buffers any further.
+ * The size of the record is taken from the index block size of the ntfs inode
+ * the @page belongs to.
+ *
+ * Dirtying the page also marks the vfs inode the ntfs record belongs to dirty
+ * (I_DIRTY_PAGES only).
+ *
+ * The @page must be uptodate.  If it does not have buffers, they are created
+ * and set uptodate here.  The page may not be locked, hence all buffer
+ * handling is done under the mapping->private_lock.  The lock is dropped
+ * around the buffer allocation, so another party may attach buffers to the
+ * page in the meantime, in which case the newly allocated ones are freed
+ * again.  Once the buffers are marked dirty the lock is no longer needed
+ * since try_to_free_buffers() does not free dirty buffers.
+ */
+void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs)
+{
+        struct address_space *mapping = page->mapping;
+        ntfs_inode *ni = NTFS_I(mapping->host);
+        struct buffer_head *first, *cur, *discard = NULL;
+        unsigned int rec_end, blk_size;
+        BUG_ON(!PageUptodate(page));
+        rec_end = ofs + ni->itype.index.block_size;
+        blk_size = 1 << VFS_I(ni)->i_blkbits;
+        spin_lock(&mapping->private_lock);
+        if (unlikely(!page_has_buffers(page))) {
+                struct buffer_head *fresh;
+                /* Do the allocation without holding the spin lock. */
+                spin_unlock(&mapping->private_lock);
+                fresh = alloc_page_buffers(page, blk_size, 1);
+                spin_lock(&mapping->private_lock);
+                if (unlikely(page_has_buffers(page))) {
+                        /* Buffers appeared meanwhile, ours are surplus. */
+                        discard = fresh;
+                } else {
+                        struct buffer_head *last = fresh;
+                        /*
+                         * Set each new buffer uptodate, then close the NULL
+                         * terminated list into a ring and attach it.
+                         */
+                        set_buffer_uptodate(last);
+                        while (last->b_this_page) {
+                                last = last->b_this_page;
+                                set_buffer_uptodate(last);
+                        }
+                        last->b_this_page = fresh;
+                        attach_page_buffers(page, fresh);
+                }
+        }
+        first = page_buffers(page);
+        cur = first;
+        do {
+                const unsigned int cur_ofs = bh_offset(cur);
+                /* Buffers ending at or before the record are skipped. */
+                if (cur_ofs + blk_size > ofs) {
+                        /* Stop at the first buffer beyond the record. */
+                        if (unlikely(cur_ofs >= rec_end))
+                                break;
+                        set_buffer_dirty(cur);
+                }
+                cur = cur->b_this_page;
+        } while (cur != first);
+        spin_unlock(&mapping->private_lock);
+        __set_page_dirty_nobuffers(page);
+        /* Free the surplus buffers, if any. */
+        if (unlikely(discard)) {
+                struct buffer_head *next;
+                for (; discard; discard = next) {
+                        next = discard->b_this_page;
+                        free_buffer_head(discard);
+                }
+        }
+}
+#endif /* NTFS_RW */
diff --git a/fs/ntfs/aops.h b/fs/ntfs/aops.h
new file mode 100644
index 000000000000..3b74e66ca2ff
--- /dev/null
+++ b/fs/ntfs/aops.h
@@ -0,0 +1,109 @@
+/**
+ * aops.h - Defines for NTFS kernel address space operations and page cache
+ *          handling.  Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2004 Anton Altaparmakov
+ * Copyright (c) 2002 Richard Russon
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#ifndef _LINUX_NTFS_AOPS_H
+#define _LINUX_NTFS_AOPS_H
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/fs.h>
+#include "inode.h"
+/**
+ * ntfs_unmap_page - release a page that was mapped using ntfs_map_page()
+ * @page:       the page to release
+ *
+ * Unpin, unmap and release a page that was obtained from ntfs_map_page().
+ *
+ * This undoes what ntfs_map_page() did, i.e. the page is first kunmap()ped and
+ * then the reference to it is dropped using page_cache_release().  The @page
+ * must not be used by the caller after this function has returned.
+ */
+static inline void ntfs_unmap_page(struct page *page)
+{
+        kunmap(page);
+        page_cache_release(page);
+}
+/**
+ * ntfs_map_page - map a page into accessible memory, reading it if necessary
+ * @mapping:    address space for which to obtain the page
+ * @index:      index into the page cache for @mapping of the page to map
+ *
+ * Obtain the page at position @index of the page cache of the address space
+ * @mapping.  Note that @index is in units of PAGE_CACHE_SIZE and not in
+ * bytes.
+ *
+ * A page which is not in memory yet is added to the page cache of @mapping
+ * and filled from disk by means of the readpage method found in the address
+ * space operations of @mapping.
+ *
+ * For a page of an mst protected attribute which is marked as such in its
+ * ntfs inode (NInoMstProtected()), the mst fixups are applied but no error
+ * checking is done.  It is thus up to the caller to check that the ntfs
+ * record(s) in the page are valid, using the ntfs_is_XXXX_record{,p}() macro
+ * matching the expected record type XXXX.  (See fs/ntfs/layout.h for details
+ * of the macros.)
+ *
+ * The page is kmap()ped, so if it is in high memory it gets mapped into
+ * memory directly addressable by the kernel.  It is also returned with its
+ * page count incremented, which pins it into place.  As a consequence
+ * page_address(page) can be used on every page obtained from ntfs_map_page()
+ * to get at its kernel virtual address.
+ *
+ * Once done with the page, the caller must call ntfs_unmap_page() to unpin,
+ * unmap and release it.
+ *
+ * No exclusive access to the page is granted.  A caller needing that has to
+ * serialize independently of the ntfs_{un}map_page() calls, for example with
+ * a {rw_}semaphore.  A spin lock is not suitable because ntfs_map_page() can
+ * block.
+ *
+ * On success the unlocked and uptodate page is returned.  On failure an
+ * encoded error is returned, so the caller has to test the return value with
+ * IS_ERR().  If that evaluates to TRUE, PTR_ERR() applied to the return value
+ * gives the negative error code.  A page that was obtained but is not
+ * uptodate or has the error flag set is released again and -EIO is returned
+ * in encoded form.
+ */
+static inline struct page *ntfs_map_page(struct address_space *mapping,
+                unsigned long index)
+{
+        struct page *page;
+        page = read_cache_page(mapping, index,
+                        (filler_t*)mapping->a_ops->readpage, NULL);
+        /* Hand an encoded error straight back to the caller. */
+        if (IS_ERR(page))
+                return page;
+        /* Let any i/o in progress on the page finish, then map the page. */
+        wait_on_page_locked(page);
+        kmap(page);
+        if (!PageUptodate(page) || PageError(page)) {
+                /* Reading the page failed, so let go of it again. */
+                ntfs_unmap_page(page);
+                return ERR_PTR(-EIO);
+        }
+        return page;
+}
+#ifdef NTFS_RW
+extern void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs);
+#endif /* NTFS_RW */
+#endif /* _LINUX_NTFS_AOPS_H */
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
new file mode 100644
index 000000000000..1ff7f90a18b0
--- /dev/null
+++ b/fs/ntfs/attrib.c
@@ -0,0 +1,1258 @@
+/**
+ * attrib.c - NTFS attribute operations.  Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2004 Anton Altaparmakov
+ * Copyright (c) 2002 Richard Russon
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#include <linux/buffer_head.h>
+#include "attrib.h"
+#include "debug.h"
+#include "layout.h"
+#include "mft.h"
+#include "ntfs.h"
+#include "types.h"
+/**
+ * ntfs_map_runlist - map (a part of) a runlist of an ntfs inode
+ * @ni:         ntfs inode for which to map (part of) a runlist
+ * @vcn:        map runlist part containing this vcn
+ *
+ * Map the part of a runlist containing the @vcn of the ntfs inode @ni.
+ *
+ * Return 0 on success and -errno on error.
+ *
+ * Locking: - The runlist must be unlocked on entry and is unlocked on return.
+ *          - This function takes the lock for writing and modifies the runlist.
+ */
+int ntfs_map_runlist(ntfs_inode *ni, VCN vcn)
+{
+        ntfs_inode *base_ni;
+        ntfs_attr_search_ctx *ctx;
+        MFT_RECORD *mrec;
+        int err = 0;
+        ntfs_debug("Mapping runlist part containing vcn 0x%llx.",
+                        (unsigned long long)vcn);
+        /* Attribute inodes are mapped through their base inode. */
+        base_ni = NInoAttr(ni) ? ni->ext.base_ntfs_ino : ni;
+        mrec = map_mft_record(base_ni);
+        if (IS_ERR(mrec))
+                return PTR_ERR(mrec);
+        ctx = ntfs_attr_get_search_ctx(base_ni, mrec);
+        if (unlikely(!ctx)) {
+                err = -ENOMEM;
+                goto err_out;
+        }
+        /* Find the attribute of @ni, passing @vcn to the lookup. */
+        err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+                        CASE_SENSITIVE, vcn, NULL, 0, ctx);
+        if (unlikely(err))
+                goto put_err_out;
+        down_write(&ni->runlist.lock);
+        /* Skip the work if someone mapped this part while we slept. */
+        if (likely(ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn) <=
+                        LCN_RL_NOT_MAPPED)) {
+                runlist_element *new_rl;
+                /* Decompress the mapping pairs of the found attribute. */
+                new_rl = ntfs_mapping_pairs_decompress(ni->vol, ctx->attr,
+                                ni->runlist.rl);
+                if (!IS_ERR(new_rl))
+                        ni->runlist.rl = new_rl;
+                else
+                        err = PTR_ERR(new_rl);
+        }
+        up_write(&ni->runlist.lock);
+put_err_out:
+        ntfs_attr_put_search_ctx(ctx);
+err_out:
+        unmap_mft_record(base_ni);
+        return err;
+}
+/**
+ * ntfs_find_vcn - find a vcn in the runlist described by an ntfs inode
+ * @ni:         ntfs inode describing the runlist to search
+ * @vcn:        vcn to find
+ * @need_write: if false, lock for reading and if true, lock for writing
+ *
+ * Find the virtual cluster number @vcn in the runlist described by the ntfs
+ * inode @ni and return the address of the runlist element containing the @vcn.
+ * The runlist is left locked and the caller has to unlock it.  If @need_write
+ * is true, the runlist is locked for writing and if @need_write is false, the
+ * runlist is locked for reading.  In the error case, the runlist is not left
+ * locked.
+ *
+ * Note you need to distinguish between the lcn of the returned runlist element
+ * being >= 0 and LCN_HOLE.  In the latter case you have to return zeroes on
+ * read and allocate clusters on write.
+ *
+ * Return the runlist element containing the @vcn on success and
+ * ERR_PTR(-errno) on error.  You need to test the return value with IS_ERR()
+ * to decide if the return is success or failure and PTR_ERR() to get to the
+ * error code if IS_ERR() is true.
+ *
+ * The possible error return codes are:
+ *      -ENOENT - No such vcn in the runlist, i.e. @vcn is out of bounds.
+ *      -ENOMEM - Not enough memory to map runlist.
+ *      -EIO    - Critical error (runlist/file is corrupt, i/o error, etc).
+ *
+ * Locking: - The runlist must be unlocked on entry.
+ *          - On failing return, the runlist is unlocked.
+ *          - On successful return, the runlist is locked.  If @need_write is
+ *            true, it is locked for writing.  Otherwise it is locked for
+ *            reading.
+ */
+runlist_element *ntfs_find_vcn(ntfs_inode *ni, const VCN vcn,
+                const BOOL need_write)
+{
+        runlist_element *rl;
+        int err = 0;
+        BOOL is_retry = FALSE;
+        /* Validate @ni before the debug message below dereferences it. */
+        BUG_ON(!ni);
+        BUG_ON(!NInoNonResident(ni));
+        BUG_ON(vcn < 0);
+        ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, lock for %sing.",
+                        ni->mft_no, (unsigned long long)vcn,
+                        !need_write ? "read" : "writ");
+lock_retry_remap:
+        if (!need_write)
+                down_read(&ni->runlist.lock);
+        else
+                down_write(&ni->runlist.lock);
+        rl = ni->runlist.rl;
+        if (likely(rl && vcn >= rl[0].vcn)) {
+                /* Walk to the element whose range contains @vcn. */
+                while (likely(rl->length)) {
+                        if (likely(vcn < rl[1].vcn)) {
+                                if (likely(rl->lcn >= LCN_HOLE)) {
+                                        ntfs_debug("Done.");
+                                        return rl;
+                                }
+                                break;
+                        }
+                        rl++;
+                }
+                /* A definite result; remapping would not help. */
+                if (likely(rl->lcn != LCN_RL_NOT_MAPPED))
+                        err = (rl->lcn == LCN_ENOENT) ? -ENOENT : -EIO;
+        }
+        /* Only a successful lookup returns with the runlist locked. */
+        if (!need_write)
+                up_read(&ni->runlist.lock);
+        else
+                up_write(&ni->runlist.lock);
+        if (!err && !is_retry) {
+                /*
+                 * The @vcn is in an unmapped region, map the runlist and
+                 * retry.
+                 */
+                err = ntfs_map_runlist(ni, vcn);
+                if (likely(!err)) {
+                        is_retry = TRUE;
+                        goto lock_retry_remap;
+                }
+                /*
+                 * -EINVAL and -ENOENT coming from a failed mapping attempt are
+                 * equivalent to i/o errors for us as they should not happen in
+                 * our code paths.
+                 */
+                if (err == -EINVAL || err == -ENOENT)
+                        err = -EIO;
+        } else if (!err)
+                err = -EIO;
+        ntfs_error(ni->vol->sb, "Failed with error code %i.", err);
+        return ERR_PTR(err);
+}
+/**
+ * ntfs_attr_find - find (next) attribute in mft record
+ * @type:       attribute type to find
+ * @name:       attribute name to find (NULL means the unnamed attribute)
+ * @name_len:   attribute name length (only needed if @name present)
+ * @ic:         IGNORE_CASE or CASE_SENSITIVE (ignored if @name not present)
+ * @val:        attribute value to find (optional, resident attributes only)
+ * @val_len:    attribute value length
+ * @ctx:        search context with mft record and attribute to search from
+ *
+ * You should not need to call this function directly.  Use ntfs_attr_lookup()
+ * instead.
+ *
+ * ntfs_attr_find() takes a search context @ctx as parameter and searches the
+ * mft record specified by @ctx->mrec, beginning at @ctx->attr, for an
+ * attribute of @type, optionally @name and @val.
+ *
+ * If the attribute is found, ntfs_attr_find() returns 0 and @ctx->attr will
+ * point to the found attribute.
+ *
+ * If the attribute is not found, ntfs_attr_find() returns -ENOENT and
+ * @ctx->attr will point to the attribute before which the attribute being
+ * searched for would need to be inserted if such an action were to be desired.
+ *
+ * On actual error, ntfs_attr_find() returns -EIO.  In this case @ctx->attr is
+ * undefined and in particular do not rely on it not changing.
+ *
+ * If @ctx->is_first is TRUE, the search begins with @ctx->attr itself.  If it
+ * is FALSE, the search begins after @ctx->attr.
+ *
+ * If @ic is IGNORE_CASE, the @name comparison is not case sensitive and
+ * @ctx->ntfs_ino must be set to the ntfs inode to which the mft record
+ * @ctx->mrec belongs.  This is so we can get at the ntfs volume and hence at
+ * the upcase table.  If @ic is CASE_SENSITIVE, the comparison is case
+ * sensitive.  When @name is present, @name_len is the @name length in Unicode
+ * characters.
+ *
+ * If @name is not present (NULL), we assume that the unnamed attribute is
+ * being searched for.
+ *
+ * Finally, the resident attribute value @val is looked for, if present.  If
+ * @val is not present (NULL), @val_len is ignored.
+ *
+ * ntfs_attr_find() only searches the specified mft record and it ignores the
+ * presence of an attribute list attribute (unless it is the one being searched
+ * for, obviously).  If you need to take attribute lists into consideration,
+ * use ntfs_attr_lookup() instead (see below).  This also means that you cannot
+ * use ntfs_attr_find() to search for extent records of non-resident
+ * attributes, as extents with lowest_vcn != 0 are usually described by the
+ * attribute list attribute only. - Note that it is possible that the first
+ * extent is only in the attribute list while the last extent is in the base
+ * mft record, so do not rely on being able to find the first extent in the
+ * base mft record.
+ *
+ * Warning: Never use @val when looking for attribute types which can be
+ *          non-resident as this most likely will result in a crash!
+ */
+static int ntfs_attr_find(const ATTR_TYPE type, const ntfschar *name,
+                const u32 name_len, const IGNORE_CASE_BOOL ic,
+                const u8 *val, const u32 val_len, ntfs_attr_search_ctx *ctx)
+{
+        ATTR_RECORD *a;
+        ntfs_volume *vol = ctx->ntfs_ino->vol;
+        ntfschar *upcase = vol->upcase;
+        u32 upcase_len = vol->upcase_len;
+        /*
+         * Iterate over attributes in mft record starting at @ctx->attr, or the
+         * attribute following that, if @ctx->is_first is FALSE.
+         */
+        if (ctx->is_first) {
+                a = ctx->attr;
+                ctx->is_first = FALSE;
+        } else
+                a = (ATTR_RECORD*)((u8*)ctx->attr +
+                                le32_to_cpu(ctx->attr->length));
+        for (;; a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length))) {
+                /* NOTE(review): @a == end of record is accepted; confirm. */
+                if ((u8*)a < (u8*)ctx->mrec || (u8*)a > (u8*)ctx->mrec +
+                                le32_to_cpu(ctx->mrec->bytes_allocated))
+                        break;
+                ctx->attr = a;
+                /* A larger type or the end marker means @type is absent. */
+                if (unlikely(le32_to_cpu(a->type) > le32_to_cpu(type) ||
+                                a->type == AT_END))
+                        return -ENOENT;
+                /* A zero length would never advance the loop; bail out. */
+                if (unlikely(!a->length))
+                        break;
+                if (a->type != type)
+                        continue;
+                /* If @name is missing, assume we want an unnamed attribute. */
+                if (!name) {
+                        /* The search failed if the found attribute is named. */
+                        if (a->name_length)
+                                return -ENOENT;
+                } else if (!ntfs_are_names_equal(name, name_len,
+                            (ntfschar*)((u8*)a + le16_to_cpu(a->name_offset)),
+                            a->name_length, ic, upcase, upcase_len)) {
+                        register int rc;
+                        rc = ntfs_collate_names(name, name_len,
+                                        (ntfschar*)((u8*)a +
+                                        le16_to_cpu(a->name_offset)),
+                                        a->name_length, 1, IGNORE_CASE,
+                                        upcase, upcase_len);
+                        /*
+                         * If @name collates before a->name, there is no
+                         * matching attribute.
+                         */
+                        if (rc == -1)
+                                return -ENOENT;
+                        /* If the strings are not equal, continue search. */
+                        if (rc)
+                                continue;
+                        /* Equal ignoring case; now collate case sensitively. */
+                        rc = ntfs_collate_names(name, name_len,
+                                        (ntfschar*)((u8*)a +
+                                        le16_to_cpu(a->name_offset)),
+                                        a->name_length, 1, CASE_SENSITIVE,
+                                        upcase, upcase_len);
+                        if (rc == -1)
+                                return -ENOENT;
+                        if (rc)
+                                continue;
+                }
+                /*
+                 * The names match or @name not present and attribute is
+                 * unnamed.  Without @val we have found the attribute.
+                 */
+                if (!val)
+                        return 0;
+                /* @val is present; compare values. */
+                else {
+                        register int rc;
+                        rc = memcmp(val, (u8*)a + le16_to_cpu(
+                                        a->data.resident.value_offset),
+                                        min_t(u32, val_len, le32_to_cpu(
+                                        a->data.resident.value_length)));
+                        /*
+                         * If @val collates before the current attribute's
+                         * value, there is no matching attribute.
+                         */
+                        if (!rc) {
+                                register u32 avl;
+                                avl = le32_to_cpu(
+                                                a->data.resident.value_length);
+                                if (val_len == avl)
+                                        return 0;
+                                if (val_len < avl)
+                                        return -ENOENT;
+                        } else if (rc < 0)
+                                return -ENOENT;
+                }
+        }
+        ntfs_error(vol->sb, "Inode is corrupt.  Run chkdsk.");
+        NVolSetErrors(vol);
+        return -EIO;
+}
+/**
+ * load_attribute_list - load an attribute list into memory
+ * @vol:                ntfs volume from which to read
+ * @runlist:            runlist of the attribute list
+ * @al_start:           destination buffer
+ * @size:               size of the destination buffer in bytes
+ * @initialized_size:   initialized size of the attribute list
+ *
+ * Walk the runlist @runlist and load all clusters from it copying them into
+ * the linear buffer @al_start. The maximum number of bytes copied to @al_start
+ * is @size bytes. Note, @size does not need to be a multiple of the cluster
+ * size. If @initialized_size is less than @size, the region in @al_start
+ * between @initialized_size and @size will be zeroed and not read from disk.
+ *
+ * Return 0 on success or -errno on error.
+ */
+int load_attribute_list(ntfs_volume *vol, runlist *runlist, u8 *al_start,
+                const s64 size, const s64 initialized_size)
+{
+        LCN lcn;
+        u8 *al = al_start;
+        u8 *al_end = al + initialized_size;
+        runlist_element *rl;
+        struct buffer_head *bh;
+        struct super_block *sb;
+        unsigned long block_size, block, max_block;
+        int err = 0;
+        unsigned char block_size_bits;
+        ntfs_debug("Entering.");
+        if (!vol || !runlist || !al || size <= 0 || initialized_size < 0 ||
+                        initialized_size > size)
+                return -EINVAL;
+        if (!initialized_size) {
+                memset(al, 0, size);
+                return 0;
+        }
+        sb = vol->sb;
+        block_size = sb->s_blocksize;
+        block_size_bits = sb->s_blocksize_bits;
+        down_read(&runlist->lock);
+        rl = runlist->rl;
+        /* An absent runlist cannot be walked; fail instead of crashing. */
+        if (!rl) {
+                ntfs_error(sb, "Cannot read attribute list since runlist is "
+                                "missing.");
+                goto err_out;
+        }
+        /* Read all clusters specified by the runlist one run at a time. */
+        while (rl->length) {
+                lcn = ntfs_rl_vcn_to_lcn(rl, rl->vcn);
+                ntfs_debug("Reading vcn = 0x%llx, lcn = 0x%llx.",
+                                (unsigned long long)rl->vcn,
+                                (unsigned long long)lcn);
+                /* The attribute list cannot be sparse. */
+                if (lcn < 0) {
+                        ntfs_error(sb, "ntfs_rl_vcn_to_lcn() failed.  Cannot "
+                                        "read attribute list.");
+                        goto err_out;
+                }
+                block = lcn << vol->cluster_size_bits >> block_size_bits;
+                /* Read the run from device in chunks of block_size bytes. */
+                max_block = block + (rl->length << vol->cluster_size_bits >>
+                                block_size_bits);
+                ntfs_debug("max_block = 0x%lx.", max_block);
+                do {
+                        ntfs_debug("Reading block = 0x%lx.", block);
+                        bh = sb_bread(sb, block);
+                        if (!bh) {
+                                ntfs_error(sb, "sb_bread() failed. Cannot "
+                                                "read attribute list.");
+                                goto err_out;
+                        }
+                        if (al + block_size >= al_end)
+                                goto do_final;
+                        memcpy(al, bh->b_data, block_size);
+                        brelse(bh);
+                        al += block_size;
+                } while (++block < max_block);
+                rl++;
+        }
+initialize:
+        if (initialized_size < size)
+                memset(al_start + initialized_size, 0, size - initialized_size);
+done:
+        up_read(&runlist->lock);
+        return err;
+do_final:
+        if (al < al_end) {
+                /*
+                 * Partial block.  Note: The attribute list can be smaller than
+                 * its allocation by multiple clusters.  This has been
+                 * encountered by at least two people running Windows XP, thus
+                 * we cannot do any truncation sanity checking here. (AIA)
+                 */
+                memcpy(al, bh->b_data, al_end - al);
+                brelse(bh);
+                goto initialize;
+        }
+        brelse(bh);
+        /* Real overflow! */
+        ntfs_error(sb, "Attribute list buffer overflow. Read attribute list "
+                        "is truncated.");
+err_out:
+        err = -EIO;
+        goto done;
+}
+/**
+ * ntfs_external_attr_find - find an attribute in the attribute list of an inode
+ * @type:       attribute type to find
+ * @name:       attribute name to find (NULL means the unnamed attribute)
+ * @name_len:   attribute name length (only needed if @name present)
+ * @ic:         IGNORE_CASE or CASE_SENSITIVE (ignored if @name not present)
+ * @lowest_vcn: lowest vcn to find (optional, non-resident attributes only)
+ * @val:        attribute value to find (optional, resident attributes only)
+ * @val_len:    attribute value length
+ * @ctx:        search context with mft record and attribute to search from
+ *
+ * You should not need to call this function directly.  Use ntfs_attr_lookup()
+ * instead.
+ *
+ * Find an attribute by searching the attribute list for the corresponding
+ * attribute list entry.  Having found the entry, map the mft record if the
+ * attribute is in a different mft record/inode, ntfs_attr_find() the attribute
+ * in there and return it.
+ *
+ * On first search @ctx->ntfs_ino must be the base mft record and @ctx must
+ * have been obtained from a call to ntfs_attr_get_search_ctx().  On subsequent
+ * calls @ctx->ntfs_ino can be any extent inode, too (@ctx->base_ntfs_ino is
+ * then the base inode).
+ *
+ * After finishing with the attribute/mft record you need to call
+ * ntfs_attr_put_search_ctx() to cleanup the search context (unmapping any
+ * mapped inodes, etc).
+ *
+ * If the attribute is found, ntfs_external_attr_find() returns 0 and
+ * @ctx->attr will point to the found attribute.  @ctx->mrec will point to the
+ * mft record in which @ctx->attr is located and @ctx->al_entry will point to
+ * the attribute list entry for the attribute.
+ *
+ * If the attribute is not found, ntfs_external_attr_find() returns -ENOENT and
+ * @ctx->attr will point to the attribute in the base mft record before which
+ * the attribute being searched for would need to be inserted if such an action
+ * were to be desired.  @ctx->mrec will point to the mft record in which
+ * @ctx->attr is located and @ctx->al_entry will point to the attribute list
+ * entry of the attribute before which the attribute being searched for would
+ * need to be inserted if such an action were to be desired.
+ *
+ * Thus to insert the not found attribute, one wants to add the attribute to
+ * @ctx->mrec (the base mft record) and if there is not enough space, the
+ * attribute should be placed in a newly allocated extent mft record.  The
+ * attribute list entry for the inserted attribute should be inserted in the
+ * attribute list attribute at @ctx->al_entry.
+ *
+ * On actual error, ntfs_external_attr_find() returns -EIO.  In this case
+ * @ctx->attr is undefined and in particular do not rely on it not changing.
+ */
+static int ntfs_external_attr_find(const ATTR_TYPE type,
+                const ntfschar *name, const u32 name_len,
+                const IGNORE_CASE_BOOL ic, const VCN lowest_vcn,
+                const u8 *val, const u32 val_len, ntfs_attr_search_ctx *ctx)
+{
+        ntfs_inode *base_ni, *ni;
+        ntfs_volume *vol;
+        ATTR_LIST_ENTRY *al_entry, *next_al_entry;
+        u8 *al_start, *al_end;
+        ATTR_RECORD *a;
+        ntfschar *al_name;
+        u32 al_name_len;
+        int err = 0;
+        static const char *es = " Unmount and run chkdsk.";
+        ni = ctx->ntfs_ino;
+        base_ni = ctx->base_ntfs_ino;
+        ntfs_debug("Entering for inode 0x%lx, type 0x%x.", ni->mft_no, type);
+        if (!base_ni) {
+                /* First call happens with the base mft record. */
+                base_ni = ctx->base_ntfs_ino = ctx->ntfs_ino;
+                ctx->base_mrec = ctx->mrec;
+        }
+        if (ni == base_ni)
+                ctx->base_attr = ctx->attr;
+        if (type == AT_END)
+                goto not_found;
+        vol = base_ni->vol;
+        al_start = base_ni->attr_list;
+        al_end = al_start + base_ni->attr_list_size;
+        if (!ctx->al_entry)
+                ctx->al_entry = (ATTR_LIST_ENTRY*)al_start;
+        /*
+         * Iterate over entries in attribute list starting at @ctx->al_entry,
+         * or the entry following that, if @ctx->is_first is TRUE.
+         */
+        if (ctx->is_first) {
+                al_entry = ctx->al_entry;
+                ctx->is_first = FALSE;
+        } else
+                al_entry = (ATTR_LIST_ENTRY*)((u8*)ctx->al_entry +
+                                le16_to_cpu(ctx->al_entry->length));
+        for (;; al_entry = next_al_entry) {
+                /* Out of bounds check. */
+                if ((u8*)al_entry < base_ni->attr_list ||
+                                (u8*)al_entry > al_end)
+                        break;  /* Inode is corrupt. */
+                ctx->al_entry = al_entry;
+                /* Catch the end of the attribute list. */
+                if ((u8*)al_entry == al_end)
+                        goto not_found;
+                if (!al_entry->length)
+                        break;
+                if ((u8*)al_entry + 6 > al_end || (u8*)al_entry +
+                                le16_to_cpu(al_entry->length) > al_end)
+                        break;
+                next_al_entry = (ATTR_LIST_ENTRY*)((u8*)al_entry +
+                                le16_to_cpu(al_entry->length));
+                if (le32_to_cpu(al_entry->type) > le32_to_cpu(type))
+                        goto not_found;
+                if (type != al_entry->type)
+                        continue;
+                /*
+                 * If @name is present, compare the two names.  If @name is
+                 * missing, assume we want an unnamed attribute.
+                 */
+                al_name_len = al_entry->name_length;
+                al_name = (ntfschar*)((u8*)al_entry + al_entry->name_offset);
+                if (!name) {
+                        if (al_name_len)
+                                goto not_found;
+                } else if (!ntfs_are_names_equal(al_name, al_name_len, name,
+                                name_len, ic, vol->upcase, vol->upcase_len)) {
+                        register int rc;
+                        rc = ntfs_collate_names(name, name_len, al_name,
+                                        al_name_len, 1, IGNORE_CASE,
+                                        vol->upcase, vol->upcase_len);
+                        /*
+                         * If @name collates before al_name, there is no
+                         * matching attribute.
+                         */
+                        if (rc == -1)
+                                goto not_found;
+                        /* If the strings are not equal, continue search. */
+                        if (rc)
+                                continue;
+                        /*
+                         * FIXME: Reverse engineering showed 0, IGNORE_CASE but
+                         * that is inconsistent with ntfs_attr_find().  The
+                         * subsequent rc checks were also different.  Perhaps I
+                         * made a mistake in one of the two.  Need to recheck
+                         * which is correct or at least see what is going on...
+                         * (AIA)
+                         */
+                        rc = ntfs_collate_names(name, name_len, al_name,
+                                        al_name_len, 1, CASE_SENSITIVE,
+                                        vol->upcase, vol->upcase_len);
+                        if (rc == -1)
+                                goto not_found;
+                        if (rc)
+                                continue;
+                }
+                /*
+                 * The names match or @name not present and attribute is
+                 * unnamed.  Now check @lowest_vcn.  Continue search if the
+                 * next attribute list entry still fits @lowest_vcn.  Otherwise
+                 * we have reached the right one or the search has failed.
+                 */
+                if (lowest_vcn && (u8*)next_al_entry >= al_start            &&
+                                (u8*)next_al_entry + 6 < al_end             &&
+                                (u8*)next_al_entry + le16_to_cpu(
+                                        next_al_entry->length) <= al_end    &&
+                                sle64_to_cpu(next_al_entry->lowest_vcn) <=
+                                        lowest_vcn                          &&
+                                next_al_entry->type == al_entry->type       &&
+                                next_al_entry->name_length == al_name_len   &&
+                                ntfs_are_names_equal((ntfschar*)((u8*)
+                                        next_al_entry +
+                                        next_al_entry->name_offset),
+                                        next_al_entry->name_length,
+                                        al_name, al_name_len, CASE_SENSITIVE,
+                                        vol->upcase, vol->upcase_len))
+                        continue;
+                if (MREF_LE(al_entry->mft_reference) == ni->mft_no) {
+                        if (MSEQNO_LE(al_entry->mft_reference) != ni->seq_no) {
+                                ntfs_error(vol->sb, "Found stale mft "
+                                                "reference in attribute list "
+                                                "of base inode 0x%lx.%s",
+                                                base_ni->mft_no, es);
+                                err = -EIO;
+                                break;
+                        }
+                } else { /* Mft references do not match. */
+                        /* If there is a mapped record unmap it first. */
+                        if (ni != base_ni)
+                                unmap_extent_mft_record(ni);
+                        /* Do we want the base record back? */
+                        if (MREF_LE(al_entry->mft_reference) ==
+                                        base_ni->mft_no) {
+                                ni = ctx->ntfs_ino = base_ni;
+                                ctx->mrec = ctx->base_mrec;
+                        } else {
+                                /* We want an extent record. */
+                                ctx->mrec = map_extent_mft_record(base_ni,
+                                                le64_to_cpu(
+                                                al_entry->mft_reference), &ni);
+                                if (IS_ERR(ctx->mrec)) {
+                                        ntfs_error(vol->sb, "Failed to map "
+                                                        "extent mft record "
+                                                        "0x%lx of base inode "
+                                                        "0x%lx.%s",
+                                                        MREF_LE(al_entry->
+                                                        mft_reference),
+                                                        base_ni->mft_no, es);
+                                        err = PTR_ERR(ctx->mrec);
+                                        if (err == -ENOENT)
+                                                err = -EIO;
+                                        /* Cause @ctx to be sanitized below. */
+                                        ni = NULL;
+                                        break;
+                                }
+                                ctx->ntfs_ino = ni;
+                        }
+                        ctx->attr = (ATTR_RECORD*)((u8*)ctx->mrec +
+                                        le16_to_cpu(ctx->mrec->attrs_offset));
+                }
+                /*
+                 * ctx->vfs_ino, ctx->mrec, and ctx->attr now point to the
+                 * mft record containing the attribute represented by the
+                 * current al_entry.
+                 */
+                /*
+                 * We could call into ntfs_attr_find() to find the right
+                 * attribute in this mft record but this would be less
+                 * efficient and not quite accurate as ntfs_attr_find() ignores
+                 * the attribute instance numbers for example which become
+                 * important when one plays with attribute lists.  Also,
+                 * because a proper match has been found in the attribute list
+                 * entry above, the comparison can now be optimized.  So it is
+                 * worth re-implementing a simplified ntfs_attr_find() here.
+                 */
+                a = ctx->attr;
+                /*
+                 * Use a manual loop so we can still use break and continue
+                 * with the same meanings as above.
+                 */
+do_next_attr_loop:
+                if ((u8*)a < (u8*)ctx->mrec || (u8*)a > (u8*)ctx->mrec +
+                                le32_to_cpu(ctx->mrec->bytes_allocated))
+                        break;
+                if (a->type == AT_END)
+                        continue;
+                if (!a->length)
+                        break;
+                if (al_entry->instance != a->instance)
+                        goto do_next_attr;
+                /*
+                 * If the type and/or the name are mismatched between the
+                 * attribute list entry and the attribute record, there is
+                 * corruption so we break and return error EIO.
+                 */
+                if (al_entry->type != a->type)
+                        break;
+                if (!ntfs_are_names_equal((ntfschar*)((u8*)a +
+                                le16_to_cpu(a->name_offset)), a->name_length,
+                                al_name, al_name_len, CASE_SENSITIVE,
+                                vol->upcase, vol->upcase_len))
+                        break;
+                ctx->attr = a;
+                /*
+                 * If no @val specified or @val specified and it matches, we
+                 * have found it!
+                 */
+                if (!val || (!a->non_resident && le32_to_cpu(
+                                a->data.resident.value_length) == val_len &&
+                                !memcmp((u8*)a +
+                                le16_to_cpu(a->data.resident.value_offset),
+                                val, val_len))) {
+                        ntfs_debug("Done, found.");
+                        return 0;
+                }
+do_next_attr:
+                /* Proceed to the next attribute in the current mft record. */
+                a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length));
+                goto do_next_attr_loop;
+        }
+        if (!err) {
+                ntfs_error(vol->sb, "Base inode 0x%lx contains corrupt "
+                                "attribute list attribute.%s", base_ni->mft_no,
+                                es);
+                err = -EIO;
+        }
+        if (ni != base_ni) {
+                if (ni)
+                        unmap_extent_mft_record(ni);
+                ctx->ntfs_ino = base_ni;
+                ctx->mrec = ctx->base_mrec;
+                ctx->attr = ctx->base_attr;
+        }
+        if (err != -ENOMEM)
+                NVolSetErrors(vol);
+        return err;
+not_found:
+        /*
+         * If we were looking for AT_END, we reset the search context @ctx and
+         * use ntfs_attr_find() to seek to the end of the base mft record.
+         */
+        if (type == AT_END) {
+                ntfs_attr_reinit_search_ctx(ctx);
+                return ntfs_attr_find(AT_END, name, name_len, ic, val, val_len,
+                                ctx);
+        }
+        /*
+         * The attribute was not found.  Before we return, we want to ensure
+         * @ctx->mrec and @ctx->attr indicate the position at which the
+         * attribute should be inserted in the base mft record.  Since we also
+         * want to preserve @ctx->al_entry we cannot reinitialize the search
+         * context using ntfs_attr_reinit_search_ctx() as this would set
+         * @ctx->al_entry to NULL.  Thus we do the necessary bits manually (see
+         * ntfs_attr_init_search_ctx() below).  Note, we _only_ preserve
+         * @ctx->al_entry as the remaining fields (base_*) are identical to
+         * their non base_ counterparts and we cannot set @ctx->base_attr
+         * correctly yet as we do not know what @ctx->attr will be set to by
+         * the call to ntfs_attr_find() below.
+         */
+        if (ni != base_ni)
+                unmap_extent_mft_record(ni);
+        ctx->mrec = ctx->base_mrec;
+        ctx->attr = (ATTR_RECORD*)((u8*)ctx->mrec +
+                        le16_to_cpu(ctx->mrec->attrs_offset));
+        ctx->is_first = TRUE;
+        ctx->ntfs_ino = base_ni;
+        ctx->base_ntfs_ino = NULL;
+        ctx->base_mrec = NULL;
+        ctx->base_attr = NULL;
+        /*
+         * In case there are multiple matches in the base mft record, need to
+         * keep enumerating until we get an attribute not found response (or
+         * another error), otherwise we would keep returning the same attribute
+         * over and over again and all programs using us for enumeration would
+         * lock up in a tight loop.
+         */
+        do {
+                err = ntfs_attr_find(type, name, name_len, ic, val, val_len,
+                                ctx);
+        } while (!err);
+        ntfs_debug("Done, not found.");
+        return err;
+}
+/**
+ * ntfs_attr_lookup - look up an attribute in an ntfs inode
+ * @type:       type of the attribute to look for
+ * @name:       name of the attribute to look for (optional, NULL = any name)
+ * @name_len:   length of @name (only meaningful if @name is given)
+ * @ic:         IGNORE_CASE or CASE_SENSITIVE (unused if @name is not given)
+ * @lowest_vcn: lowest vcn to look for (optional, non-resident attributes only)
+ * @val:        attribute value to look for (optional, resident attributes only)
+ * @val_len:    length of @val
+ * @ctx:        search context holding the mft record and attribute to start at
+ *
+ * Look up an attribute in an ntfs inode.  For the first search,
+ * @ctx->ntfs_ino must be the base mft record and @ctx must come from
+ * ntfs_attr_get_search_ctx().
+ *
+ * Attribute lists are dealt with transparently: if the base inode has an
+ * attribute list and @type is not AT_ATTRIBUTE_LIST itself, the search is
+ * handed to ntfs_external_attr_find(); otherwise it is handed to
+ * ntfs_attr_find(), in which case @lowest_vcn is not used.  @ctx records where
+ * the search stopped so that a subsequent call continues from there.
+ *
+ * Once done with the attribute/mft record, call ntfs_attr_put_search_ctx() to
+ * clean up the search context (this unmaps any mapped inodes, etc).
+ *
+ * Return 0 if the attribute was found and -errno otherwise.
+ *
+ * On 0, @ctx->attr is the attribute that was found and it lives in the mft
+ * record @ctx->mrec.  If an attribute list attribute is present,
+ * @ctx->al_entry is the attribute list entry of the found attribute.
+ *
+ * On -ENOENT, @ctx->attr is the attribute collating just after the one that
+ * was searched for, i.e. the position at which the attribute would have to be
+ * inserted into the mft record.  If an attribute list attribute is present,
+ * @ctx->al_entry is likewise the attribute list entry collating just after
+ * the entry of the attribute searched for, i.e. the position at which its
+ * attribute list entry would have to be inserted.
+ *
+ * On any other -errno, an error occurred during the lookup.  @ctx->attr is
+ * then undefined and in particular must not be relied upon to be unchanged.
+ */
+int ntfs_attr_lookup(const ATTR_TYPE type, const ntfschar *name,
+                const u32 name_len, const IGNORE_CASE_BOOL ic,
+                const VCN lowest_vcn, const u8 *val, const u32 val_len,
+                ntfs_attr_search_ctx *ctx)
+{
+        ntfs_inode *base_ni;
+        ntfs_debug("Entering.");
+        /* Without a recorded base inode, the context inode is the base. */
+        base_ni = ctx->base_ntfs_ino ? ctx->base_ntfs_ino : ctx->ntfs_ino;
+        /* Sanity check, just for debugging really. */
+        BUG_ON(!base_ni);
+        /* Only go via the attribute list if there is one to go through. */
+        if (NInoAttrList(base_ni) && type != AT_ATTRIBUTE_LIST)
+                return ntfs_external_attr_find(type, name, name_len, ic,
+                                lowest_vcn, val, val_len, ctx);
+        return ntfs_attr_find(type, name, name_len, ic, val, val_len, ctx);
+}
+/**
+ * ntfs_attr_init_search_ctx - set up an attribute search context
+ * @ctx:        attribute search context to set up
+ * @ni:         ntfs inode to store in the search context
+ * @mrec:       mft record to store in the search context
+ *
+ * Set up the attribute search context @ctx for a fresh search in the mft
+ * record @mrec of the ntfs inode @ni.  The search position is placed on the
+ * first attribute record of @mrec and all attribute list related state is
+ * cleared.
+ */
+static inline void ntfs_attr_init_search_ctx(ntfs_attr_search_ctx *ctx,
+                ntfs_inode *ni, MFT_RECORD *mrec)
+{
+        /* Sanity checks are performed elsewhere. */
+        const u16 first_attr_ofs = le16_to_cpu(mrec->attrs_offset);
+        ctx->ntfs_ino = ni;
+        ctx->mrec = mrec;
+        ctx->attr = (ATTR_RECORD*)((u8*)mrec + first_attr_ofs);
+        ctx->is_first = TRUE;
+        /* No attribute list state exists for a fresh search. */
+        ctx->base_attr = NULL;
+        ctx->base_mrec = NULL;
+        ctx->base_ntfs_ino = NULL;
+        ctx->al_entry = NULL;
+}
+/**
+ * ntfs_attr_reinit_search_ctx - rewind an attribute search context
+ * @ctx:        attribute search context to rewind
+ *
+ * Rewind the attribute search context @ctx so that the next search starts at
+ * the first attribute again.  If an extent mft record is associated with
+ * @ctx, it is unmapped and @ctx is set up anew for the base inode and its mft
+ * record.
+ *
+ * Use this when starting a search for a new attribute with an existing
+ * search context.
+ */
+void ntfs_attr_reinit_search_ctx(ntfs_attr_search_ctx *ctx)
+{
+        if (unlikely(ctx->base_ntfs_ino)) {
+                /* Attribute list: drop the extent record, restart at base. */
+                if (ctx->ntfs_ino != ctx->base_ntfs_ino)
+                        unmap_extent_mft_record(ctx->ntfs_ino);
+                ntfs_attr_init_search_ctx(ctx, ctx->base_ntfs_ino,
+                                ctx->base_mrec);
+                return;
+        }
+        /*
+         * No attribute list: only the search position needs rewinding.
+         * Sanity checks are performed elsewhere.
+         */
+        ctx->attr = (ATTR_RECORD*)((u8*)ctx->mrec +
+                        le16_to_cpu(ctx->mrec->attrs_offset));
+        ctx->is_first = TRUE;
+        /*
+         * ntfs_external_attr_find() can leave this set even though it has
+         * zeroed ctx->base_ntfs_ino, so it has to be cleared here.
+         */
+        ctx->al_entry = NULL;
+}
+/**
+ * ntfs_attr_get_search_ctx - obtain a new, initialized attribute search context
+ * @ni:         ntfs inode to store in the search context
+ * @mrec:       mft record to store in the search context
+ *
+ * Allocate an attribute search context from ntfs_attr_ctx_cache and
+ * initialize it with @ni and @mrec.
+ *
+ * Return the new search context, or NULL if the allocation failed.
+ */
+ntfs_attr_search_ctx *ntfs_attr_get_search_ctx(ntfs_inode *ni, MFT_RECORD *mrec)
+{
+        ntfs_attr_search_ctx *ctx = kmem_cache_alloc(ntfs_attr_ctx_cache,
+                        SLAB_NOFS);
+        if (!ctx)
+                return NULL;
+        ntfs_attr_init_search_ctx(ctx, ni, mrec);
+        return ctx;
+}
+/**
+ * ntfs_attr_put_search_ctx - dispose of an attribute search context
+ * @ctx:        attribute search context to dispose of
+ *
+ * Unmap the extent mft record associated with the attribute search context
+ * @ctx, if there is one, and give @ctx back to ntfs_attr_ctx_cache.
+ */
+void ntfs_attr_put_search_ctx(ntfs_attr_search_ctx *ctx)
+{
+        ntfs_inode *base_ni = ctx->base_ntfs_ino;
+        /* An extent record is only mapped if we moved away from the base. */
+        if (base_ni && ctx->ntfs_ino != base_ni)
+                unmap_extent_mft_record(ctx->ntfs_ino);
+        kmem_cache_free(ntfs_attr_ctx_cache, ctx);
+}
+/**
+ * ntfs_attr_find_in_attrdef - look up an attribute type in $AttrDef
+ * @vol:        ntfs volume to which the attribute belongs
+ * @type:       attribute type to look up
+ *
+ * Walk the in-memory copy of the $AttrDef system file (@vol->attrdef) looking
+ * for the attribute definition record of the attribute @type.  The walk ends
+ * at @vol->attrdef_size bytes, at the first record with a zero type, or as
+ * soon as a record with a type greater than @type is seen, i.e. the records
+ * are expected to be in ascending order of type.
+ *
+ * Return the attribute definition record if found and NULL if not found.
+ */
+static ATTR_DEF *ntfs_attr_find_in_attrdef(const ntfs_volume *vol,
+                const ATTR_TYPE type)
+{
+        ATTR_DEF *ad;
+        BUG_ON(!vol->attrdef);
+        BUG_ON(!type);
+        ad = vol->attrdef;
+        while ((u8*)ad - (u8*)vol->attrdef < vol->attrdef_size && ad->type) {
+                /* Exact match; this is the record we are after. */
+                if (ad->type == type)
+                        return ad;
+                /* Already past where @type would be; give up. */
+                if (le32_to_cpu(ad->type) > le32_to_cpu(type))
+                        break;
+                /* Still below @type; move on to the next record. */
+                ad++;
+        }
+        /* Attribute not found. */
+        ntfs_debug("Attribute type 0x%x not found in $AttrDef.",
+                        le32_to_cpu(type));
+        return NULL;
+}
+/**
+ * ntfs_attr_size_bounds_check - validate a size for a given attribute type
+ * @vol:        ntfs volume to which the attribute belongs
+ * @type:       attribute type to validate against
+ * @size:       size in bytes to validate (must not be negative)
+ *
+ * Check whether @size is an acceptable size for an attribute of @type on the
+ * ntfs volume @vol, using the limits recorded in the $AttrDef system file.  A
+ * minimum or maximum that is not greater than zero is not enforced.
+ *
+ * Return 0 if valid, -ERANGE if not valid, or -ENOENT if the attribute is not
+ * listed in $AttrDef.
+ */
+int ntfs_attr_size_bounds_check(const ntfs_volume *vol, const ATTR_TYPE type,
+                const s64 size)
+{
+        ATTR_DEF *ad;
+        s64 min_size, max_size;
+        BUG_ON(size < 0);
+        /*
+         * The 256kiB upper limit of $ATTRIBUTE_LIST is not recorded in
+         * $AttrDef, so it has to be enforced by hand.
+         */
+        if (unlikely(type == AT_ATTRIBUTE_LIST && size > 256 * 1024))
+                return -ERANGE;
+        /* Fetch the $AttrDef record describing @type. */
+        ad = ntfs_attr_find_in_attrdef(vol, type);
+        if (unlikely(!ad))
+                return -ENOENT;
+        /* Enforce the lower bound, if one is set. */
+        min_size = sle64_to_cpu(ad->min_size);
+        if (min_size > 0 && size < min_size)
+                return -ERANGE;
+        /* Enforce the upper bound, if one is set. */
+        max_size = sle64_to_cpu(ad->max_size);
+        if (max_size > 0 && size > max_size)
+                return -ERANGE;
+        return 0;
+}
+/**
+ * ntfs_attr_can_be_non_resident - may an attribute type be non-resident?
+ * @vol:        ntfs volume to which the attribute belongs
+ * @type:       attribute type to check
+ *
+ * Determine whether an attribute of @type on the ntfs volume @vol may be made
+ * non-resident.  Except for $DATA, which is always allowed, the answer is
+ * taken from the flags of the attribute's record in the $AttrDef system file.
+ *
+ * Return 0 if the attribute is allowed to be non-resident, -EPERM if not, or
+ * -ENOENT if the attribute is not listed in $AttrDef.
+ */
+int ntfs_attr_can_be_non_resident(const ntfs_volume *vol, const ATTR_TYPE type)
+{
+        ATTR_DEF *ad;
+        /*
+         * $DATA may always be non-resident, no matter what the flags of its
+         * attribute definition record in $AttrDef say.
+         */
+        if (type == AT_DATA)
+                return 0;
+        /* Everything else is governed by its $AttrDef record. */
+        ad = ntfs_attr_find_in_attrdef(vol, type);
+        if (unlikely(!ad))
+                return -ENOENT;
+        return (ad->flags & CAN_BE_NON_RESIDENT) ? 0 : -EPERM;
+}
+/**
+ * ntfs_attr_can_be_resident - may an attribute type be resident?
+ * @vol:        ntfs volume to which the attribute belongs (not consulted)
+ * @type:       attribute type to check
+ *
+ * Determine whether an attribute of @type on the ntfs volume @vol may be
+ * resident.  The answer is derived from our ntfs knowledge rather than from
+ * the volume and may not be completely accurate, especially when user defined
+ * attributes are present.  Everything is allowed to be resident except for
+ * index allocation and $EA attributes.
+ *
+ * Return 0 if the attribute is allowed to be resident and -EPERM if not.
+ *
+ * Warning: In the system file $MFT the attribute $Bitmap must be non-resident
+ *          otherwise windows will not boot (blue screen of death)!  This
+ *          cannot be checked here because it is not known which inode's
+ *          $Bitmap is being asked about, so the caller has to special case it.
+ */
+int ntfs_attr_can_be_resident(const ntfs_volume *vol, const ATTR_TYPE type)
+{
+        /* The two attribute types that must never be resident. */
+        if (type == AT_INDEX_ALLOCATION || type == AT_EA)
+                return -EPERM;
+        return 0;
+}
+/**
+ * ntfs_attr_record_resize - change the size of an attribute record
+ * @m:          mft record containing the attribute record
+ * @a:          attribute record to resize
+ * @new_size:   size in bytes the attribute record @a is to have
+ *
+ * Change the size of the attribute record @a, i.e. of the resident part of
+ * the attribute, inside the mft record @m to @new_size bytes, rounded up to a
+ * multiple of 8 bytes.  The data following @a in @m is moved accordingly and
+ * the bytes in use count of @m is updated.
+ *
+ * Return 0 on success and -errno on error.  The following error codes are
+ * defined:
+ *      -ENOSPC - Not enough space in the mft record @m to perform the resize.
+ *
+ * Note: On error, no modifications have been performed whatsoever.
+ *
+ * Warning: When shrinking a record, anything in it that has not been copied
+ *          elsewhere beforehand may be overwritten.
+ */
+int ntfs_attr_record_resize(MFT_RECORD *m, ATTR_RECORD *a, u32 new_size)
+{
+        u32 old_size, in_use, new_muse;
+        ntfs_debug("Entering for new_size %u.", new_size);
+        /* Round up to a multiple of 8 bytes (no-op if already aligned). */
+        new_size = (new_size + 7) & ~7;
+        old_size = le32_to_cpu(a->length);
+        /* Nothing to move if the record already has the wanted size. */
+        if (new_size == old_size)
+                return 0;
+        in_use = le32_to_cpu(m->bytes_in_use);
+        new_muse = in_use - old_size + new_size;
+        /* The resized record must still fit into the mft record. */
+        if (new_muse > le32_to_cpu(m->bytes_allocated))
+                return -ENOSPC;
+        /* Relocate everything that follows @a to behind its new end. */
+        memmove((u8*)a + new_size, (u8*)a + old_size,
+                        in_use - ((u8*)a - (u8*)m) - old_size);
+        /* Record the new amount of used space in @m. */
+        m->bytes_in_use = cpu_to_le32(new_muse);
+        /* Only store the new length if the length field still exists. */
+        if (new_size >= offsetof(ATTR_REC, length) + sizeof(a->length))
+                a->length = cpu_to_le32(new_size);
+        return 0;
+}
+/**
+ * ntfs_attr_set - fill (a part of) an attribute with a byte
+ * @ni:         ntfs inode describing the attribute to fill
+ * @ofs:        offset inside the attribute at which to start to fill
+ * @cnt:        number of bytes to fill
+ * @val:        the unsigned 8-bit value with which to fill the attribute
+ *
+ * Fill @cnt bytes of the attribute described by the ntfs inode @ni starting at
+ * byte offset @ofs inside the attribute with the constant byte @val.
+ *
+ * This function is effectively like memset() applied to an ntfs attribute.
+ * The fill is done through the page cache: a partial first and/or last page
+ * is read in and modified, pages that are covered completely are grabbed and
+ * overwritten without being read.  All touched pages are marked dirty.
+ *
+ * Return 0 on success and -errno on error.  The following error codes are
+ * defined:
+ *      -ESPIPE - @ofs + @cnt is outside the end of the attribute; no write
+ *                was performed.
+ *      -ENOMEM - Not enough memory to grab a page cache page.
+ *      -EIO    - A partial page was read but did not become uptodate.
+ * Any error returned by read_cache_page() is passed on as is.
+ *
+ * Note: On errors other than -ESPIPE, pages before the failing one may
+ *       already have been filled and dirtied.
+ */
+int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
+{
+        ntfs_volume *vol = ni->vol;
+        struct address_space *mapping;
+        struct page *page;
+        u8 *kaddr;
+        s64 end_byte;
+        pgoff_t idx, end;
+        unsigned int start_ofs, end_ofs, size;
+        ntfs_debug("Entering for ofs 0x%llx, cnt 0x%llx, val 0x%hx.",
+                        (long long)ofs, (long long)cnt, val);
+        BUG_ON(ofs < 0);
+        BUG_ON(cnt < 0);
+        if (!cnt)
+                goto done;
+        mapping = VFS_I(ni)->i_mapping;
+        /* Work out the starting index and page offset. */
+        idx = ofs >> PAGE_CACHE_SHIFT;
+        start_ofs = ofs & ~PAGE_CACHE_MASK;
+        /*
+         * Work out the ending index and page offset.  The end position is
+         * kept in a 64-bit variable until it has been checked and shifted
+         * down to a page index as pgoff_t can be too narrow to hold a byte
+         * position.
+         */
+        end_byte = ofs + cnt;
+        end_ofs = end_byte & ~PAGE_CACHE_MASK;
+        /* If the end is outside the inode size return -ESPIPE. */
+        if (unlikely(end_byte > VFS_I(ni)->i_size)) {
+                ntfs_error(vol->sb, "Request exceeds end of attribute.");
+                return -ESPIPE;
+        }
+        end = end_byte >> PAGE_CACHE_SHIFT;
+        /* If there is a first partial page, need to do it the slow way. */
+        if (start_ofs) {
+                page = read_cache_page(mapping, idx,
+                                (filler_t*)mapping->a_ops->readpage, NULL);
+                if (IS_ERR(page)) {
+                        ntfs_error(vol->sb, "Failed to read first partial "
+                                        "page (sync error, index 0x%lx).", idx);
+                        return PTR_ERR(page);
+                }
+                wait_on_page_locked(page);
+                if (unlikely(!PageUptodate(page))) {
+                        ntfs_error(vol->sb, "Failed to read first partial page "
+                                        "(async error, index 0x%lx).", idx);
+                        page_cache_release(page);
+                        /* @page is a valid pointer here, not an ERR_PTR. */
+                        return -EIO;
+                }
+                /*
+                 * If the last page is the same as the first page, need to
+                 * limit the write to the end offset.
+                 */
+                size = PAGE_CACHE_SIZE;
+                if (idx == end)
+                        size = end_ofs;
+                kaddr = kmap_atomic(page, KM_USER0);
+                memset(kaddr + start_ofs, val, size - start_ofs);
+                flush_dcache_page(page);
+                kunmap_atomic(kaddr, KM_USER0);
+                set_page_dirty(page);
+                page_cache_release(page);
+                if (idx == end)
+                        goto done;
+                idx++;
+        }
+        /* Do the whole pages the fast way. */
+        for (; idx < end; idx++) {
+                /* Find or create the current page.  (The page is locked.) */
+                page = grab_cache_page(mapping, idx);
+                if (unlikely(!page)) {
+                        ntfs_error(vol->sb, "Insufficient memory to grab "
+                                        "page (index 0x%lx).", idx);
+                        return -ENOMEM;
+                }
+                kaddr = kmap_atomic(page, KM_USER0);
+                memset(kaddr, val, PAGE_CACHE_SIZE);
+                flush_dcache_page(page);
+                kunmap_atomic(kaddr, KM_USER0);
+                /*
+                 * If the page has buffers, mark them uptodate since buffer
+                 * state and not page state is definitive in 2.6 kernels.
+                 */
+                if (page_has_buffers(page)) {
+                        struct buffer_head *bh, *head;
+                        bh = head = page_buffers(page);
+                        do {
+                                set_buffer_uptodate(bh);
+                        } while ((bh = bh->b_this_page) != head);
+                }
+                /* Now that buffers are uptodate, set the page uptodate, too. */
+                SetPageUptodate(page);
+                /*
+                 * Set the page and all its buffers dirty and mark the inode
+                 * dirty, too.  The VM will write the page later on.
+                 */
+                set_page_dirty(page);
+                /* Finally unlock and release the page. */
+                unlock_page(page);
+                page_cache_release(page);
+        }
+        /* If there is a last partial page, need to do it the slow way. */
+        if (end_ofs) {
+                page = read_cache_page(mapping, idx,
+                                (filler_t*)mapping->a_ops->readpage, NULL);
+                if (IS_ERR(page)) {
+                        ntfs_error(vol->sb, "Failed to read last partial page "
+                                        "(sync error, index 0x%lx).", idx);
+                        return PTR_ERR(page);
+                }
+                wait_on_page_locked(page);
+                if (unlikely(!PageUptodate(page))) {
+                        ntfs_error(vol->sb, "Failed to read last partial page "
+                                        "(async error, index 0x%lx).", idx);
+                        page_cache_release(page);
+                        /* @page is a valid pointer here, not an ERR_PTR. */
+                        return -EIO;
+                }
+                kaddr = kmap_atomic(page, KM_USER0);
+                memset(kaddr, val, end_ofs);
+                flush_dcache_page(page);
+                kunmap_atomic(kaddr, KM_USER0);
+                set_page_dirty(page);
+                page_cache_release(page);
+        }
+done:
+        ntfs_debug("Done.");
+        return 0;
+}
diff --git a/fs/ntfs/attrib.h b/fs/ntfs/attrib.h
new file mode 100644
index 000000000000..e0c2c6c81bc0
--- /dev/null
+++ b/fs/ntfs/attrib.h
@@ -0,0 +1,100 @@
+/*
+ * attrib.h - Defines for attribute handling in NTFS Linux kernel driver.
+ *            Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2004 Anton Altaparmakov
+ * Copyright (c) 2002 Richard Russon
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#ifndef _LINUX_NTFS_ATTRIB_H
+#define _LINUX_NTFS_ATTRIB_H
+#include "endian.h"
+#include "types.h"
+#include "layout.h"
+#include "inode.h"
+#include "runlist.h"
+#include "volume.h"
+/**
+ * ntfs_attr_search_ctx - used in attribute search functions
+ * @mrec:       buffer containing mft record to search
+ * @attr:       attribute record in @mrec where to begin/continue search
+ * @is_first:   if true ntfs_attr_lookup() begins search with @attr, else after
+ * @ntfs_ino:   ntfs inode the search context was initialized with; while an
+ *              attribute list search is in progress this can be an extent
+ *              inode instead of the base inode
+ * @al_entry:   attribute list entry of the current position when an attribute
+ *              list is being searched; NULL after initialization
+ * @base_ntfs_ino:      base ntfs inode while an attribute list search is in
+ *              progress; NULL when no attribute list is involved
+ * @base_mrec:  mft record to return to when the search goes back to the base
+ *              inode; NULL after initialization
+ * @base_attr:  attribute record to return to when the search goes back to the
+ *              base inode; NULL after initialization
+ *
+ * Structure must be initialized to zero before the first call to one of the
+ * attribute search functions. Initialize @mrec to point to the mft record to
+ * search, and @attr to point to the first attribute within @mrec (not necessary
+ * if calling the _first() functions), and set @is_first to TRUE (not necessary
+ * if calling the _first() functions).
+ *
+ * If @is_first is TRUE, the search begins with @attr. If @is_first is FALSE,
+ * the search begins after @attr. This is so that, after the first call to one
+ * of the search attribute functions, we can call the function again, without
+ * any modification of the search context, to automagically get the next
+ * matching attribute.
+ *
+ * Search contexts are normally obtained from ntfs_attr_get_search_ctx(), which
+ * performs the initialization, and released with ntfs_attr_put_search_ctx().
+ */
+typedef struct {
+        MFT_RECORD *mrec;
+        ATTR_RECORD *attr;
+        BOOL is_first;
+        ntfs_inode *ntfs_ino;
+        ATTR_LIST_ENTRY *al_entry;
+        ntfs_inode *base_ntfs_ino;
+        MFT_RECORD *base_mrec;
+        ATTR_RECORD *base_attr;
+} ntfs_attr_search_ctx;
+/*
+ * Runlist helpers.  NOTE(review): presumably these map the runlist of @ni
+ * around @vcn and locate the runlist element for @vcn -- confirm against
+ * their definitions.
+ */
+extern int ntfs_map_runlist(ntfs_inode *ni, VCN vcn);
+extern runlist_element *ntfs_find_vcn(ntfs_inode *ni, const VCN vcn,
+                const BOOL need_write);
+/*
+ * Find an attribute in an ntfs inode using the search context @ctx.  Returns
+ * 0 if found, -ENOENT if not found, and another -errno on error.
+ */
+int ntfs_attr_lookup(const ATTR_TYPE type, const ntfschar *name,
+                const u32 name_len, const IGNORE_CASE_BOOL ic,
+                const VCN lowest_vcn, const u8 *val, const u32 val_len,
+                ntfs_attr_search_ctx *ctx);
+/*
+ * NOTE(review): presumably reads the attribute list described by @rl into
+ * the buffer at @al_start -- confirm against the definition.
+ */
+extern int load_attribute_list(ntfs_volume *vol, runlist *rl, u8 *al_start,
+                const s64 size, const s64 initialized_size);
+/**
+ * ntfs_attr_size - get the size in bytes of an attribute's value
+ * @a:          attribute record of which to get the size
+ *
+ * Return the data size recorded in the attribute record @a if the attribute
+ * is non-resident, and the resident value length otherwise.
+ */
+static inline s64 ntfs_attr_size(const ATTR_RECORD *a)
+{
+        if (a->non_resident)
+                return sle64_to_cpu(a->data.non_resident.data_size);
+        return (s64)le32_to_cpu(a->data.resident.value_length);
+}
+/* Search context management: rewind, allocate and initialize, release. */
+extern void ntfs_attr_reinit_search_ctx(ntfs_attr_search_ctx *ctx);
+extern ntfs_attr_search_ctx *ntfs_attr_get_search_ctx(ntfs_inode *ni,
+                MFT_RECORD *mrec);
+extern void ntfs_attr_put_search_ctx(ntfs_attr_search_ctx *ctx);
+/*
+ * Attribute type checks.  Each returns 0 if the queried property holds and
+ * -errno otherwise; the first two consult $AttrDef and return -ENOENT for
+ * attribute types not listed there.
+ */
+extern int ntfs_attr_size_bounds_check(const ntfs_volume *vol,
+                const ATTR_TYPE type, const s64 size);
+extern int ntfs_attr_can_be_non_resident(const ntfs_volume *vol,
+                const ATTR_TYPE type);
+extern int ntfs_attr_can_be_resident(const ntfs_volume *vol,
+                const ATTR_TYPE type);
+/* Resize the attribute record @a in @m; returns -ENOSPC if it cannot fit. */
+extern int ntfs_attr_record_resize(MFT_RECORD *m, ATTR_RECORD *a, u32 new_size);
+/* memset()-like fill of @cnt bytes at offset @ofs of the attribute of @ni. */
+extern int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt,
+                const u8 val);
+#endif /* _LINUX_NTFS_ATTRIB_H */
diff --git a/fs/ntfs/bitmap.c b/fs/ntfs/bitmap.c
new file mode 100644
index 000000000000..12cf2e30c7dd
--- /dev/null
+++ b/fs/ntfs/bitmap.c
@@ -0,0 +1,192 @@
+/*
+ * bitmap.c - NTFS kernel bitmap handling.  Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2004 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#ifdef NTFS_RW
+#include <linux/pagemap.h>
+#include "bitmap.h"
+#include "debug.h"
+#include "aops.h"
+#include "ntfs.h"
+/**
+ * __ntfs_bitmap_set_bits_in_run - set a run of bits in a bitmap to a value
+ * @vi:                 vfs inode describing the bitmap
+ * @start_bit:          first bit to set
+ * @count:              number of bits to set
+ * @value:              value to set the bits to (i.e. 0 or 1)
+ * @is_rollback:        if TRUE this is a rollback operation
+ *
+ * Set @count bits starting at bit @start_bit in the bitmap described by the
+ * vfs inode @vi to @value, where @value is either 0 or 1.
+ *
+ * The run is processed in three phases: the leading bits up to the next byte
+ * boundary, then whole bytes (page by page, using memset()), and finally the
+ * trailing bits of the last, partial byte.
+ *
+ * If mapping a page after the first one fails, the bits modified so far are
+ * set back to the opposite of @value by calling this function recursively with
+ * @is_rollback TRUE.  If that rollback fails as well, the volume is marked as
+ * having errors.
+ *
+ * @is_rollback should always be FALSE, it is for internal use to rollback
+ * errors.  You probably want to use ntfs_bitmap_set_bits_in_run() instead.
+ *
+ * Return 0 on success and -errno on error.
+ */
+int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit,
+                const s64 count, const u8 value, const BOOL is_rollback)
+{
+        s64 cnt = count;
+        pgoff_t index, end_index;
+        struct address_space *mapping;
+        struct page *page;
+        u8 *kaddr;
+        int pos, len;
+        u8 bit;
+        BUG_ON(!vi);
+        ntfs_debug("Entering for i_ino 0x%lx, start_bit 0x%llx, count 0x%llx, "
+                        "value %u.%s", vi->i_ino, (unsigned long long)start_bit,
+                        (unsigned long long)cnt, (unsigned int)value,
+                        is_rollback ? " (rollback)" : "");
+        BUG_ON(start_bit < 0);
+        BUG_ON(cnt < 0);
+        BUG_ON(value > 1);
+        /*
+         * Calculate the indices for the pages containing the first and last
+         * bits, i.e. @start_bit and @start_bit + @cnt - 1, respectively.
+         */
+        index = start_bit >> (3 + PAGE_CACHE_SHIFT);
+        end_index = (start_bit + cnt - 1) >> (3 + PAGE_CACHE_SHIFT);
+        /* Get the page containing the first bit (@start_bit). */
+        mapping = vi->i_mapping;
+        page = ntfs_map_page(mapping, index);
+        if (IS_ERR(page)) {
+                if (!is_rollback)
+                        ntfs_error(vi->i_sb, "Failed to map first page (error "
+                                        "%li), aborting.", PTR_ERR(page));
+                return PTR_ERR(page);
+        }
+        kaddr = page_address(page);
+        /* Set @pos to the position of the byte containing @start_bit. */
+        pos = (start_bit >> 3) & ~PAGE_CACHE_MASK;
+        /* Calculate the position of @start_bit in the first byte. */
+        bit = start_bit & 7;
+        /* If the first byte is partial, modify the appropriate bits in it. */
+        if (bit) {
+                u8 *byte = kaddr + pos;
+                /*
+                 * Note @cnt must only be decremented for bits that are
+                 * actually modified.  Decrementing it as part of the loop
+                 * condition would take it to -1 when the run ends inside this
+                 * byte, which would defeat the !@cnt check below and lead to
+                 * a memset() with a negative length.
+                 */
+                while ((bit & 7) && cnt) {
+                        cnt--;
+                        if (value)
+                                *byte |= 1 << bit++;
+                        else
+                                *byte &= ~(1 << bit++);
+                }
+                /* If we are done, unmap the page and return success. */
+                if (!cnt)
+                        goto done;
+                /* Update @pos to the new position. */
+                pos++;
+        }
+        /*
+         * Depending on @value, modify all remaining whole bytes in the page up
+         * to @cnt.
+         */
+        len = min_t(s64, cnt >> 3, PAGE_CACHE_SIZE - pos);
+        memset(kaddr + pos, value ? 0xff : 0, len);
+        cnt -= len << 3;
+        /* Update @len to point to the first not-done byte in the page. */
+        if (cnt < 8)
+                len += pos;
+        /* If we are not in the last page, deal with all subsequent pages. */
+        while (index < end_index) {
+                BUG_ON(cnt <= 0);
+                /* Update @index and get the next page. */
+                flush_dcache_page(page);
+                set_page_dirty(page);
+                ntfs_unmap_page(page);
+                page = ntfs_map_page(mapping, ++index);
+                if (IS_ERR(page))
+                        goto rollback;
+                kaddr = page_address(page);
+                /*
+                 * Depending on @value, modify all remaining whole bytes in the
+                 * page up to @cnt.
+                 */
+                len = min_t(s64, cnt >> 3, PAGE_CACHE_SIZE);
+                memset(kaddr, value ? 0xff : 0, len);
+                cnt -= len << 3;
+        }
+        /*
+         * The currently mapped page is the last one.  If the last byte is
+         * partial, modify the appropriate bits in it.  Note, @len is the
+         * position of the last byte inside the page.
+         */
+        if (cnt) {
+                u8 *byte;
+                BUG_ON(cnt > 7);
+                bit = cnt;
+                byte = kaddr + len;
+                while (bit--) {
+                        if (value)
+                                *byte |= 1 << bit;
+                        else
+                                *byte &= ~(1 << bit);
+                }
+        }
+done:
+        /* We are done.  Unmap the page and return success. */
+        flush_dcache_page(page);
+        set_page_dirty(page);
+        ntfs_unmap_page(page);
+        ntfs_debug("Done.");
+        return 0;
+rollback:
+        /*
+         * Current state:
+         *      - no pages are mapped
+         *      - @count - @cnt is the number of bits that have been modified
+         */
+        if (is_rollback)
+                return PTR_ERR(page);
+        /* @pos is reused here to hold the result of the rollback attempt. */
+        if (count != cnt)
+                pos = __ntfs_bitmap_set_bits_in_run(vi, start_bit, count - cnt,
+                                value ? 0 : 1, TRUE);
+        else
+                pos = 0;
+        if (!pos) {
+                /* Rollback was successful. */
+                ntfs_error(vi->i_sb, "Failed to map subsequent page (error "
+                                "%li), aborting.", PTR_ERR(page));
+        } else {
+                /* Rollback failed. */
+                ntfs_error(vi->i_sb, "Failed to map subsequent page (error "
+                                "%li) and rollback failed (error %i).  "
+                                "Aborting and leaving inconsistent metadata.  "
+                                "Unmount and run chkdsk.", PTR_ERR(page), pos);
+                NVolSetErrors(NTFS_SB(vi->i_sb));
+        }
+        return PTR_ERR(page);
+}
+#endif /* NTFS_RW */
diff --git a/fs/ntfs/bitmap.h b/fs/ntfs/bitmap.h
new file mode 100644
index 000000000000..bb50d6bc9212
--- /dev/null
+++ b/fs/ntfs/bitmap.h
@@ -0,0 +1,118 @@
+/*
+ * bitmap.h - Defines for NTFS kernel bitmap handling.  Part of the Linux-NTFS
+ *            project.
+ *
+ * Copyright (c) 2004 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#ifndef _LINUX_NTFS_BITMAP_H
+#define _LINUX_NTFS_BITMAP_H
+#ifdef NTFS_RW
+#include <linux/fs.h>
+#include "types.h"
+extern int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit,
+                const s64 count, const u8 value, const BOOL is_rollback);
+/**
+ * ntfs_bitmap_set_bits_in_run - set a run of bits in a bitmap to a value
+ * @vi:                 vfs inode describing the bitmap
+ * @start_bit:          first bit to set
+ * @count:              number of bits to set
+ * @value:              value to set the bits to (i.e. 0 or 1)
+ *
+ * Change the @count bits beginning at bit @start_bit of the bitmap described
+ * by the vfs inode @vi so that each of them equals @value (0 or 1).  This is
+ * the non-rollback entry point to __ntfs_bitmap_set_bits_in_run().
+ *
+ * Return 0 on success and -errno on error.
+ */
+static inline int ntfs_bitmap_set_bits_in_run(struct inode *vi,
+                const s64 start_bit, const s64 count, const u8 value)
+{
+        const BOOL is_rollback = FALSE;
+        return __ntfs_bitmap_set_bits_in_run(vi, start_bit, count, value,
+                        is_rollback);
+}
+/**
+ * ntfs_bitmap_set_run - set a run of bits in a bitmap
+ * @vi:         vfs inode describing the bitmap
+ * @start_bit:  first bit to set
+ * @count:      number of bits to set
+ *
+ * Turn on (set to 1) the @count bits beginning at bit @start_bit of the bitmap
+ * described by the vfs inode @vi.
+ *
+ * Return 0 on success and -errno on error.
+ */
+static inline int ntfs_bitmap_set_run(struct inode *vi, const s64 start_bit,
+                const s64 count)
+{
+        /* Value 1, not a rollback. */
+        return __ntfs_bitmap_set_bits_in_run(vi, start_bit, count, 1, FALSE);
+}
+/**
+ * ntfs_bitmap_clear_run - clear a run of bits in a bitmap
+ * @vi:         vfs inode describing the bitmap
+ * @start_bit:  first bit to clear
+ * @count:      number of bits to clear
+ *
+ * Turn off (set to 0) the @count bits beginning at bit @start_bit of the
+ * bitmap described by the vfs inode @vi.
+ *
+ * Return 0 on success and -errno on error.
+ */
+static inline int ntfs_bitmap_clear_run(struct inode *vi, const s64 start_bit,
+                const s64 count)
+{
+        /* Value 0, not a rollback. */
+        return __ntfs_bitmap_set_bits_in_run(vi, start_bit, count, 0, FALSE);
+}
+/**
+ * ntfs_bitmap_set_bit - set a bit in a bitmap
+ * @vi:         vfs inode describing the bitmap
+ * @bit:        bit to set
+ *
+ * Turn on the single bit @bit of the bitmap described by the vfs inode @vi,
+ * i.e. a run of length one with value 1.
+ *
+ * Return 0 on success and -errno on error.
+ */
+static inline int ntfs_bitmap_set_bit(struct inode *vi, const s64 bit)
+{
+        return ntfs_bitmap_set_bits_in_run(vi, bit, 1, 1);
+}
+/**
+ * ntfs_bitmap_clear_bit - clear a bit in a bitmap
+ * @vi:         vfs inode describing the bitmap
+ * @bit:        bit to clear
+ *
+ * Turn off the single bit @bit of the bitmap described by the vfs inode @vi,
+ * i.e. a run of length one with value 0.
+ *
+ * Return 0 on success and -errno on error.
+ */
+static inline int ntfs_bitmap_clear_bit(struct inode *vi, const s64 bit)
+{
+        return ntfs_bitmap_set_bits_in_run(vi, bit, 1, 0);
+}
+#endif /* NTFS_RW */
+#endif /* defined _LINUX_NTFS_BITMAP_H */
diff --git a/fs/ntfs/collate.c b/fs/ntfs/collate.c
new file mode 100644
index 000000000000..4a28ab3898ef
--- /dev/null
+++ b/fs/ntfs/collate.c
@@ -0,0 +1,124 @@
+/*
+ * collate.c - NTFS kernel collation handling.  Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2004 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#include "collate.h"
+#include "debug.h"
+#include "ntfs.h"
+/*
+ * ntfs_collate_binary - byte-wise comparison of two data items
+ *
+ * The common prefix (the shorter of the two lengths) is compared with
+ * memcmp().  If the prefixes are equal, the shorter item collates first.
+ * Note that a non-zero memcmp() result is returned as is.  @vol is unused.
+ */
+static int ntfs_collate_binary(ntfs_volume *vol,
+                const void *data1, const int data1_len,
+                const void *data2, const int data2_len)
+{
+        int result;
+        ntfs_debug("Entering.");
+        result = memcmp(data1, data2, min(data1_len, data2_len));
+        if (result == 0) {
+                /* Equal prefixes: let the lengths decide. */
+                if (data1_len < data2_len)
+                        result = -1;
+                else if (data1_len > data2_len)
+                        result = 1;
+        }
+        ntfs_debug("Done, returning %i", result);
+        return result;
+}
+/*
+ * ntfs_collate_ntofs_ulong - compare two little endian 32-bit unsigned values
+ *
+ * Both items must be exactly 4 bytes long; anything else triggers a BUG().
+ * Returns -1, 0 or 1 if the first value is less than, equal to or greater
+ * than the second one.  @vol is unused.
+ */
+static int ntfs_collate_ntofs_ulong(ntfs_volume *vol,
+                const void *data1, const int data1_len,
+                const void *data2, const int data2_len)
+{
+        u32 lhs, rhs;
+        int result;
+        ntfs_debug("Entering.");
+        // FIXME:  We don't really want to bug here.
+        BUG_ON(data1_len != data2_len);
+        BUG_ON(data1_len != 4);
+        lhs = le32_to_cpup(data1);
+        rhs = le32_to_cpup(data2);
+        if (lhs == rhs)
+                result = 0;
+        else
+                result = (lhs < rhs) ? -1 : 1;
+        ntfs_debug("Done, returning %i", result);
+        return result;
+}
+/*
+ * Signature shared by all collation handlers: volume, first item and its
+ * length in bytes, second item and its length in bytes.
+ */
+typedef int (*ntfs_collate_func_t)(ntfs_volume *, const void *, const int,
+                const void *, const int);
+/*
+ * Handlers for collation rule values 0x00-0x02, indexed directly by the cpu
+ * endian rule value in ntfs_collate().  NULL slots are rules that are not
+ * implemented (the commented out names show the intended handlers).
+ */
+static ntfs_collate_func_t ntfs_do_collate0x0[3] = {
+        ntfs_collate_binary,
+        NULL/*ntfs_collate_file_name*/,
+        NULL/*ntfs_collate_unicode_string*/,
+};
+/*
+ * Handlers for collation rule values 0x10-0x13, indexed by the cpu endian
+ * rule value minus 0x10 in ntfs_collate().  NULL slots are rules that are not
+ * implemented (the commented out names show the intended handlers).
+ */
+static ntfs_collate_func_t ntfs_do_collate0x1[4] = {
+        ntfs_collate_ntofs_ulong,
+        NULL/*ntfs_collate_ntofs_sid*/,
+        NULL/*ntfs_collate_ntofs_security_hash*/,
+        NULL/*ntfs_collate_ntofs_ulongs*/,
+};
+/**
+ * ntfs_collate - collate two data items using a specified collation rule
+ * @vol:        ntfs volume to which the data items belong
+ * @cr:         collation rule to use when comparing the items
+ * @data1:      first data item to collate
+ * @data1_len:  length in bytes of @data1
+ * @data2:      second data item to collate
+ * @data2_len:  length in bytes of @data2
+ *
+ * Compare @data1 with @data2 according to the collation rule @cr.  The result
+ * is -1, 0, or 1 if @data1 is found, respectively, to collate before, to
+ * match, or to collate after @data2.
+ *
+ * For speed the cpu endian value of @cr selects the collation function from
+ * one of two tables of function pointers: rules 0x00-0x02 index
+ * ntfs_do_collate0x0 directly and rules 0x10-0x13 index ntfs_do_collate0x1
+ * after subtracting 0x10.
+ */
+int ntfs_collate(ntfs_volume *vol, COLLATION_RULE cr,
+                const void *data1, const int data1_len,
+                const void *data2, const int data2_len)
+{
+        ntfs_collate_func_t collate_fn;
+        int idx;
+        ntfs_debug("Entering.");
+        /*
+         * FIXME:  At the moment we only support COLLATION_BINARY and
+         * COLLATION_NTOFS_ULONG, so we BUG() for everything else for now.
+         */
+        BUG_ON(!(cr == COLLATION_BINARY || cr == COLLATION_NTOFS_ULONG));
+        idx = le32_to_cpu(cr);
+        BUG_ON(idx < 0);
+        if (idx > 0x02) {
+                /* Not in the first table, so it has to be a 0x1x rule. */
+                BUG_ON(idx < 0x10);
+                idx -= 0x10;
+                if (unlikely(idx > 3)) {
+                        BUG();
+                        return 0;
+                }
+                collate_fn = ntfs_do_collate0x1[idx];
+        } else
+                collate_fn = ntfs_do_collate0x0[idx];
+        return collate_fn(vol, data1, data1_len, data2, data2_len);
+}
diff --git a/fs/ntfs/collate.h b/fs/ntfs/collate.h
new file mode 100644
index 000000000000..e027f36fcc2f
--- /dev/null
+++ b/fs/ntfs/collate.h
@@ -0,0 +1,50 @@
+/*
+ * collate.h - Defines for NTFS kernel collation handling.  Part of the
+ *             Linux-NTFS project.
+ *
+ * Copyright (c) 2004 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#ifndef _LINUX_NTFS_COLLATE_H
+#define _LINUX_NTFS_COLLATE_H
+#include "types.h"
+#include "volume.h"
+/*
+ * ntfs_is_collation_rule_supported - check whether ntfs_collate() handles @cr
+ *
+ * Returns TRUE if the collation rule @cr is one this driver implements and
+ * FALSE otherwise.
+ */
+static inline BOOL ntfs_is_collation_rule_supported(COLLATION_RULE cr)
+{
+        int rule;
+        /*
+         * FIXME:  At the moment we only support COLLATION_BINARY and
+         * COLLATION_NTOFS_ULONG, so we return false for everything else for
+         * now.
+         */
+        if (unlikely(!(cr == COLLATION_BINARY || cr == COLLATION_NTOFS_ULONG)))
+                return FALSE;
+        rule = le32_to_cpu(cr);
+        /* Rules 0x00-0x02 live in the first table of handlers... */
+        if (rule >= 0 && rule <= 0x02)
+                return TRUE;
+        /* ...and rules 0x10-0x13 in the second one. */
+        if (rule >= 0x10 && rule <= 0x13)
+                return TRUE;
+        return FALSE;
+}
+extern int ntfs_collate(ntfs_volume *vol, COLLATION_RULE cr,
+                const void *data1, const int data1_len,
+                const void *data2, const int data2_len);
+#endif /* _LINUX_NTFS_COLLATE_H */
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
new file mode 100644
index 000000000000..ee5ae706f861
--- /dev/null
+++ b/fs/ntfs/compress.c
@@ -0,0 +1,957 @@
+/**
+ * compress.c - NTFS kernel compressed attributes handling.
+ *              Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2004 Anton Altaparmakov
+ * Copyright (c) 2002 Richard Russon
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/blkdev.h>
+#include <linux/vmalloc.h>
+#include "attrib.h"
+#include "inode.h"
+#include "debug.h"
+#include "ntfs.h"
+/**
+ * ntfs_compression_constants - enum of constants used in the compression code
+ *
+ * The token constants are applied to the lowest bit of the current tag byte
+ * in ntfs_decompress() to tell symbol tokens from phrase tokens.  The
+ * sub-block (sb) constants are applied to the little endian 16-bit header at
+ * the start of each sub-block.
+ */
+typedef enum {
+        /* Token types and access mask. */
+        NTFS_SYMBOL_TOKEN       =       0,
+        NTFS_PHRASE_TOKEN       =       1,
+        NTFS_TOKEN_MASK         =       1,
+        /* Compression sub-block constants. */
+        /* Mask extracting the size field from the sub-block header. */
+        NTFS_SB_SIZE_MASK       =       0x0fff,
+        /* Number of bytes one sub-block occupies in the destination. */
+        NTFS_SB_SIZE            =       0x1000,
+        /* Header flag: set if the sub-block data is compressed. */
+        NTFS_SB_IS_COMPRESSED   =       0x8000,
+        /*
+         * The maximum compression block size is by definition 16 * the cluster
+         * size, with the maximum supported cluster size being 4kiB. Thus the
+         * maximum compression buffer size is 64kiB, so we use this when
+         * initializing the compression buffer.
+         */
+        NTFS_MAX_CB_SIZE        = 64 * 1024,
+} ntfs_compression_constants;
+/**
+ * ntfs_compression_buffer - one buffer for the decompression engine
+ *
+ * NULL until allocate_compression_buffers() has vmalloc()ed NTFS_MAX_CB_SIZE
+ * bytes for it and NULL again after free_compression_buffers().
+ */
+static u8 *ntfs_compression_buffer = NULL;
+/**
+ * ntfs_cb_lock - spinlock which protects ntfs_compression_buffer
+ *
+ * ntfs_decompress() drops this lock once it has finished with the compression
+ * block, before it finalizes the completed destination pages.
+ */
+static DEFINE_SPINLOCK(ntfs_cb_lock);
+/**
+ * allocate_compression_buffers - allocate the decompression buffers
+ *
+ * Allocate the NTFS_MAX_CB_SIZE bytes large ntfs_compression_buffer.  It is a
+ * bug to call this while the buffer is already allocated.
+ *
+ * Caller has to hold the ntfs_lock semaphore.
+ *
+ * Return 0 on success or -ENOMEM if the allocations failed.
+ */
+int allocate_compression_buffers(void)
+{
+        u8 *buf;
+        BUG_ON(ntfs_compression_buffer);
+        buf = vmalloc(NTFS_MAX_CB_SIZE);
+        ntfs_compression_buffer = buf;
+        return buf ? 0 : -ENOMEM;
+}
+/**
+ * free_compression_buffers - free the decompression buffers
+ *
+ * Release ntfs_compression_buffer and reset the pointer to NULL.  It is a bug
+ * to call this while the buffer is not allocated.
+ *
+ * Caller has to hold the ntfs_lock semaphore.
+ */
+void free_compression_buffers(void)
+{
+        u8 *buf = ntfs_compression_buffer;
+        BUG_ON(!buf);
+        vfree(buf);
+        ntfs_compression_buffer = NULL;
+}
+/**
+ * zero_partial_compressed_page - zero out of bounds compressed page region
+ *
+ * Zero the part of @page that lies at or beyond the initialized size of @ni:
+ * the whole page if it starts at or after the initialized size, otherwise
+ * only the tail starting at the in-page offset of the initialized size.
+ */
+static void zero_partial_compressed_page(ntfs_inode *ni, struct page *page)
+{
+        u8 *kaddr = page_address(page);
+        ntfs_debug("Zeroing page region outside initialized size.");
+        if (((s64)page->index << PAGE_CACHE_SHIFT) < ni->initialized_size) {
+                /* The initialized size ends inside this page. */
+                unsigned int ofs = ni->initialized_size & ~PAGE_CACHE_MASK;
+                memset(kaddr + ofs, 0, PAGE_CACHE_SIZE - ofs);
+        } else {
+                /*
+                 * FIXME: Using clear_page() will become wrong when we get
+                 * PAGE_CACHE_SIZE != PAGE_SIZE but for now there is no problem.
+                 */
+                clear_page(kaddr);
+        }
+}
+/**
+ * handle_bounds_compressed_page - test for&handle out of bounds compressed page
+ *
+ * If the initialized size of @ni is smaller than the size of its vfs inode
+ * and @page is not located entirely below the initialized size, zero the out
+ * of bounds part of @page.  Otherwise do nothing.
+ */
+static inline void handle_bounds_compressed_page(ntfs_inode *ni,
+                struct page *page)
+{
+        /* Page lies completely inside the initialized region. */
+        if (page->index < (ni->initialized_size >> PAGE_CACHE_SHIFT))
+                return;
+        /* Nothing lies outside the initialized size. */
+        if (ni->initialized_size >= VFS_I(ni)->i_size)
+                return;
+        zero_partial_compressed_page(ni, page);
+}
+/**
+ * ntfs_decompress - decompress a compression block into an array of pages
+ * @dest_pages:         destination array of pages
+ * @dest_index:         current index into @dest_pages (IN/OUT)
+ * @dest_ofs:           current offset within @dest_pages[@dest_index] (IN/OUT)
+ * @dest_max_index:     maximum index into @dest_pages (IN)
+ * @dest_max_ofs:       maximum offset within @dest_pages[@dest_max_index] (IN)
+ * @xpage:              the target page (-1 if none) (IN)
+ * @xpage_done:         set to 1 if xpage was completed successfully (IN/OUT)
+ * @cb_start:           compression block to decompress (IN)
+ * @cb_size:            size of compression block @cb_start in bytes (IN)
+ *
+ * The caller must have disabled preemption. ntfs_decompress() reenables it when
+ * the critical section is finished.
+ *
+ * This decompresses the compression block @cb_start into the array of
+ * destination pages @dest_pages starting at index @dest_index into @dest_pages
+ * and at offset @dest_pos into the page @dest_pages[@dest_index].
+ *
+ * When the page @dest_pages[@xpage] is completed, @xpage_done is set to 1.
+ * If xpage is -1 or @xpage has not been completed, @xpage_done is not modified.
+ *
+ * @cb_start is a pointer to the compression block which needs decompressing
+ * and @cb_size is the size of @cb_start in bytes (8-64kiB).
+ *
+ * Return 0 if success or -EOVERFLOW on error in the compressed stream.
+ * @xpage_done indicates whether the target page (@dest_pages[@xpage]) was
+ * completed during the decompression of the compression block (@cb_start).
+ *
+ * Warning: This function *REQUIRES* PAGE_CACHE_SIZE >= 4096 or it will blow up
+ * unpredicatbly! You have been warned!
+ *
+ * Note to hackers: This function may not sleep until it has finished accessing
+ * the compression block @cb_start as it is a per-CPU buffer.
+ */
+static int ntfs_decompress(struct page *dest_pages[], int *dest_index,
+                int *dest_ofs, const int dest_max_index, const int dest_max_ofs,
+                const int xpage, char *xpage_done, u8 *const cb_start,
+                const u32 cb_size)
+{
+        /*
+         * Pointers into the compressed data, i.e. the compression block (cb),
+         * and the therein contained sub-blocks (sb).
+         */
+        u8 *cb_end = cb_start + cb_size; /* End of cb. */
+        u8 *cb = cb_start;      /* Current position in cb. */
+        u8 *cb_sb_start = cb;   /* Beginning of the current sb in the cb. */
+        u8 *cb_sb_end;          /* End of current sb / beginning of next sb. */
+        /* Variables for uncompressed data / destination. */
+        struct page *dp;        /* Current destination page being worked on. */
+        u8 *dp_addr;            /* Current pointer into dp. */
+        u8 *dp_sb_start;        /* Start of current sub-block in dp. */
+        u8 *dp_sb_end;          /* End of current sb in dp (dp_sb_start +
+                                   NTFS_SB_SIZE). */
+        u16 do_sb_start;        /* @dest_ofs when starting this sub-block. */
+        u16 do_sb_end;          /* @dest_ofs of end of this sb (do_sb_start +
+                                   NTFS_SB_SIZE). */
+        /* Variables for tag and token parsing. */
+        u8 tag;                 /* Current tag. */
+        int token;              /* Loop counter for the eight tokens in tag. */
+        /* Need this because we can't sleep, so need two stages. */
+        int completed_pages[dest_max_index - *dest_index + 1];
+        int nr_completed_pages = 0;
+        /* Default error code. */
+        int err = -EOVERFLOW;
+        ntfs_debug("Entering, cb_size = 0x%x.", cb_size);
+do_next_sb:
+        ntfs_debug("Beginning sub-block at offset = 0x%zx in the cb.",
+                        cb - cb_start);
+        /*
+         * Have we reached the end of the compression block or the end of the
+         * decompressed data?  The latter can happen for example if the current
+         * position in the compression block is one byte before its end so the
+         * first two checks do not detect it.
+         */
+        if (cb == cb_end || !le16_to_cpup((le16*)cb) ||
+                        (*dest_index == dest_max_index &&
+                        *dest_ofs == dest_max_ofs)) {
+                int i;
+                ntfs_debug("Completed. Returning success (0).");
+                err = 0;
+return_error:
+                /* We can sleep from now on, so we drop lock. */
+                spin_unlock(&ntfs_cb_lock);
+                /* Second stage: finalize completed pages. */
+                if (nr_completed_pages > 0) {
+                        struct page *page = dest_pages[completed_pages[0]];
+                        ntfs_inode *ni = NTFS_I(page->mapping->host);
+                        for (i = 0; i < nr_completed_pages; i++) {
+                                int di = completed_pages[i];
+                                dp = dest_pages[di];
+                                /*
+                                 * If we are outside the initialized size, zero
+                                 * the out of bounds page range.
+                                 */
+                                handle_bounds_compressed_page(ni, dp);
+                                flush_dcache_page(dp);
+                                kunmap(dp);
+                                SetPageUptodate(dp);
+                                unlock_page(dp);
+                                if (di == xpage)
+                                        *xpage_done = 1;
+                                else
+                                        page_cache_release(dp);
+                                dest_pages[di] = NULL;
+                        }
+                }
+                return err;
+        }
+        /* Setup offsets for the current sub-block destination. */
+        do_sb_start = *dest_ofs;
+        do_sb_end = do_sb_start + NTFS_SB_SIZE;
+        /* Check that we are still within allowed boundaries. */
+        if (*dest_index == dest_max_index && do_sb_end > dest_max_ofs)
+                goto return_overflow;
+        /* Does the minimum size of a compressed sb overflow valid range? */
+        if (cb + 6 > cb_end)
+                goto return_overflow;
+        /* Setup the current sub-block source pointers and validate range. */
+        cb_sb_start = cb;
+        cb_sb_end = cb_sb_start + (le16_to_cpup((le16*)cb) & NTFS_SB_SIZE_MASK)
+                        + 3;
+        if (cb_sb_end > cb_end)
+                goto return_overflow;
+        /* Get the current destination page. */
+        dp = dest_pages[*dest_index];
+        if (!dp) {
+                /* No page present. Skip decompression of this sub-block. */
+                cb = cb_sb_end;
+                /* Advance destination position to next sub-block. */
+                *dest_ofs = (*dest_ofs + NTFS_SB_SIZE) & ~PAGE_CACHE_MASK;
+                if (!*dest_ofs && (++*dest_index > dest_max_index))
+                        goto return_overflow;
+                goto do_next_sb;
+        }
+        /* We have a valid destination page. Setup the destination pointers. */
+        dp_addr = (u8*)page_address(dp) + do_sb_start;
+        /* Now, we are ready to process the current sub-block (sb). */
+        if (!(le16_to_cpup((le16*)cb) & NTFS_SB_IS_COMPRESSED)) {
+                ntfs_debug("Found uncompressed sub-block.");
+                /* This sb is not compressed, just copy it into destination. */
+                /* Advance source position to first data byte. */
+                cb += 2;
+                /* An uncompressed sb must be full size. */
+                if (cb_sb_end - cb != NTFS_SB_SIZE)
+                        goto return_overflow;
+                /* Copy the block and advance the source position. */
+                memcpy(dp_addr, cb, NTFS_SB_SIZE);
+                cb += NTFS_SB_SIZE;
+                /* Advance destination position to next sub-block. */
+                *dest_ofs += NTFS_SB_SIZE;
+                if (!(*dest_ofs &= ~PAGE_CACHE_MASK)) {
+finalize_page:
+                        /*
+                         * First stage: add current page index to array of
+                         * completed pages.
+                         */
+                        completed_pages[nr_completed_pages++] = *dest_index;
+                        if (++*dest_index > dest_max_index)
+                                goto return_overflow;
+                }
+                goto do_next_sb;
+        }
+        ntfs_debug("Found compressed sub-block.");
+        /* This sb is compressed, decompress it into destination. */
+        /* Setup destination pointers. */
+        dp_sb_start = dp_addr;
+        dp_sb_end = dp_sb_start + NTFS_SB_SIZE;
+        /* Forward to the first tag in the sub-block. */
+        cb += 2;
+do_next_tag:
+        if (cb == cb_sb_end) {
+                /* Check if the decompressed sub-block was not full-length. */
+                if (dp_addr < dp_sb_end) {
+                        int nr_bytes = do_sb_end - *dest_ofs;
+                        ntfs_debug("Filling incomplete sub-block with "
+                                        "zeroes.");
+                        /* Zero remainder and update destination position. */
+                        memset(dp_addr, 0, nr_bytes);
+                        *dest_ofs += nr_bytes;
+                }
+                /* We have finished the current sub-block. */
+                if (!(*dest_ofs &= ~PAGE_CACHE_MASK))
+                        goto finalize_page;
+                goto do_next_sb;
+        }
+        /* Check we are still in range. */
+        if (cb > cb_sb_end || dp_addr > dp_sb_end)
+                goto return_overflow;
+        /* Get the next tag and advance to first token. */
+        tag = *cb++;
+        /* Parse the eight tokens described by the tag. */
+        for (token = 0; token < 8; token++, tag >>= 1) {
+                u16 lg, pt, length, max_non_overlap;
+                register u16 i;
+                u8 *dp_back_addr;
+                /* Check if we are done / still in range. */
+                if (cb >= cb_sb_end || dp_addr > dp_sb_end)
+                        break;
+                /* Determine token type and parse appropriately.*/
+                if ((tag & NTFS_TOKEN_MASK) == NTFS_SYMBOL_TOKEN) {
+                        /*
+                         * We have a symbol token, copy the symbol across, and
+                         * advance the source and destination positions.
+                         */
+                        *dp_addr++ = *cb++;
+                        ++*dest_ofs;
+                        /* Continue with the next token. */
+                        continue;
+                }
+                /*
+                 * We have a phrase token. Make sure it is not the first tag in
+                 * the sb as this is illegal and would confuse the code below.
+                 */
+                if (dp_addr == dp_sb_start)
+                        goto return_overflow;
+                /*
+                 * Determine the number of bytes to go back (p) and the number
+                 * of bytes to copy (l). We use an optimized algorithm in which
+                 * we first calculate log2(current destination position in sb),
+                 * which allows determination of l and p in O(1) rather than
+                 * O(n). We just need an arch-optimized log2() function now.
+                 */
+                lg = 0;
+                for (i = *dest_ofs - do_sb_start - 1; i >= 0x10; i >>= 1)
+                        lg++;
+                /* Get the phrase token into i. */
+                pt = le16_to_cpup((le16*)cb);
+                /*
+                 * Calculate starting position of the byte sequence in
+                 * the destination using the fact that p = (pt >> (12 - lg)) + 1
+                 * and make sure we don't go too far back.
+                 */
+                dp_back_addr = dp_addr - (pt >> (12 - lg)) - 1;
+                if (dp_back_addr < dp_sb_start)
+                        goto return_overflow;
+                /* Now calculate the length of the byte sequence. */
+                length = (pt & (0xfff >> lg)) + 3;
+                /* Advance destination position and verify it is in range. */
+                *dest_ofs += length;
+                if (*dest_ofs > do_sb_end)
+                        goto return_overflow;
+                /* The number of non-overlapping bytes. */
+                max_non_overlap = dp_addr - dp_back_addr;
+                if (length <= max_non_overlap) {
+                        /* The byte sequence doesn't overlap, just copy it. */
+                        memcpy(dp_addr, dp_back_addr, length);
+                        /* Advance destination pointer. */
+                        dp_addr += length;
+                } else {
+                        /*
+                         * The byte sequence does overlap, copy non-overlapping
+                         * part and then do a slow byte by byte copy for the
+                         * overlapping part. Also, advance the destination
+                         * pointer.
+                         */
+                        memcpy(dp_addr, dp_back_addr, max_non_overlap);
+                        dp_addr += max_non_overlap;
+                        dp_back_addr += max_non_overlap;
+                        length -= max_non_overlap;
+                        while (length--)
+                                *dp_addr++ = *dp_back_addr++;
+                }
+                /* Advance source position and continue with the next token. */
+                cb += 2;
+        }
+        /* No tokens left in the current tag. Continue with the next tag. */
+        goto do_next_tag;
+return_overflow:
+        ntfs_error(NULL, "Failed. Returning -EOVERFLOW.");
+        goto return_error;
+}
+/**
+ * ntfs_read_compressed_block - read a compressed block into the page cache
+ * @page:       locked page in the compression block(s) we need to read
+ *
+ * When we are called the page has already been verified to be locked and the
+ * attribute is known to be non-resident, not encrypted, but compressed.
+ * Return 0 if @page was completed and a negative error code otherwise.
+ *
+ * 1. Determine which compression block(s) @page is in.
+ * 2. Get hold of all pages corresponding to this/these compression block(s).
+ * 3. Read the (first) compression block.
+ * 4. Decompress it into the corresponding pages.
+ * 5. Throw the compressed data away and proceed to 3. for the next compression
+ *    block or return success if no more compression blocks left.
+ *
+ * Warning: We have to be careful with existing pages. They might have been
+ * written to, so overwriting them with stale uncompressed data loses data.
+ *
+ * FIXME: For PAGE_CACHE_SIZE > cb_size we are not doing the Right Thing(TM) at
+ * the end of the file I think. We need to detect this case and zero the out
+ * of bounds remainder of the page in question and mark it as handled. At the
+ * moment we would just return -EIO on such a page. This bug will only become
+ * apparent if pages are above 8kiB and the NTFS volume only uses 512 byte
+ * clusters so is probably not going to be seen by anyone. Still this should
+ * be fixed. (AIA)
+ *
+ * FIXME: Again for PAGE_CACHE_SIZE > cb_size we are screwing up both in
+ * handling sparse and compressed cbs. (AIA)
+ *
+ * FIXME: At the moment we don't do any zeroing out in the case that
+ * initialized_size is less than data_size. This should be safe because of the
+ * nature of the compression algorithm used. Just in case we check and output
+ * an error message in read inode if the two sizes are not equal for a
+ * compressed file. (AIA)
+ */
+int ntfs_read_compressed_block(struct page *page)
+{
+        struct address_space *mapping = page->mapping;
+        ntfs_inode *ni = NTFS_I(mapping->host);
+        ntfs_volume *vol = ni->vol;
+        struct super_block *sb = vol->sb;
+        runlist_element *rl;
+        unsigned long block_size = sb->s_blocksize;
+        unsigned char block_size_bits = sb->s_blocksize_bits;
+        u8 *cb, *cb_pos, *cb_end;
+        struct buffer_head **bhs;
+        unsigned long offset, index = page->index;
+        u32 cb_size = ni->itype.compressed.block_size;
+        u64 cb_size_mask = cb_size - 1UL;
+        VCN vcn;
+        LCN lcn;
+        /* The first wanted vcn (minimum alignment is PAGE_CACHE_SIZE). */
+        VCN start_vcn = (((s64)index << PAGE_CACHE_SHIFT) & ~cb_size_mask) >>
+                        vol->cluster_size_bits;
+        /*
+         * The first vcn after the last wanted vcn (minimum alignment is again
+         * PAGE_CACHE_SIZE).
+         */
+        VCN end_vcn = ((((s64)(index + 1UL) << PAGE_CACHE_SHIFT) + cb_size - 1)
+                        & ~cb_size_mask) >> vol->cluster_size_bits;
+        /* Number of compression blocks (cbs) in the wanted vcn range. */
+        unsigned int nr_cbs = (end_vcn - start_vcn) << vol->cluster_size_bits
+                        >> ni->itype.compressed.block_size_bits;
+        /*
+         * Number of pages required to store the uncompressed data from all
+         * compression blocks (cbs) overlapping @page. Due to alignment
+         * guarantees of start_vcn and end_vcn, no need to round up here.
+         */
+        unsigned int nr_pages = (end_vcn - start_vcn) <<
+                        vol->cluster_size_bits >> PAGE_CACHE_SHIFT;
+        unsigned int xpage, max_page, cur_page, cur_ofs, i;
+        unsigned int cb_clusters, cb_max_ofs;
+        int block, max_block, cb_max_page, bhs_size, nr_bhs, err = 0;
+        struct page **pages;
+        unsigned char xpage_done = 0;
+        ntfs_debug("Entering, page->index = 0x%lx, cb_size = 0x%x, nr_pages = "
+                        "%i.", index, cb_size, nr_pages);
+        /*
+         * Bad things happen if we get here for anything that is not an
+         * unnamed $DATA attribute.
+         */
+        BUG_ON(ni->type != AT_DATA);
+        BUG_ON(ni->name_len);
+        pages = kmalloc(nr_pages * sizeof(struct page *), GFP_NOFS);
+        /* Allocate memory to store the buffer heads we need. */
+        bhs_size = cb_size / block_size * sizeof(struct buffer_head *);
+        bhs = kmalloc(bhs_size, GFP_NOFS);
+        if (unlikely(!pages || !bhs)) {
+                kfree(bhs);
+                kfree(pages);
+                SetPageError(page);
+                unlock_page(page);
+                ntfs_error(vol->sb, "Failed to allocate internal buffers.");
+                return -ENOMEM;
+        }
+        /*
+         * We have already been given one page, this is the one we must do.
+         * Once again, the alignment guarantees keep it simple.
+         */
+        offset = start_vcn << vol->cluster_size_bits >> PAGE_CACHE_SHIFT;
+        xpage = index - offset;
+        pages[xpage] = page;
+        /*
+         * The remaining pages need to be allocated and inserted into the page
+         * cache, alignment guarantees keep all the below much simpler. (-8
+         */
+        max_page = ((VFS_I(ni)->i_size + PAGE_CACHE_SIZE - 1) >>
+                        PAGE_CACHE_SHIFT) - offset;
+        if (nr_pages < max_page)
+                max_page = nr_pages;
+        for (i = 0; i < max_page; i++, offset++) {
+                if (i != xpage)
+                        pages[i] = grab_cache_page_nowait(mapping, offset);
+                page = pages[i];
+                if (page) {
+                        /*
+                         * We only (re)read the page if it isn't already read
+                         * in and/or dirty or we would be losing data or at
+                         * least wasting our time.
+                         */
+                        if (!PageDirty(page) && (!PageUptodate(page) ||
+                                        PageError(page))) {
+                                ClearPageError(page);
+                                kmap(page);
+                                continue;
+                        }
+                        unlock_page(page);
+                        page_cache_release(page);
+                        pages[i] = NULL;
+                }
+        }
+        /*
+         * We have the runlist, and all the destination pages we need to fill.
+         * Now read the first compression block.
+         */
+        cur_page = 0;
+        cur_ofs = 0;
+        cb_clusters = ni->itype.compressed.block_clusters;
+do_next_cb:
+        nr_cbs--;
+        nr_bhs = 0;
+        /* Read all cb buffer heads one cluster at a time. */
+        rl = NULL;
+        for (vcn = start_vcn, start_vcn += cb_clusters; vcn < start_vcn;
+                        vcn++) {
+                BOOL is_retry = FALSE;
+                if (!rl) {
+lock_retry_remap:
+                        down_read(&ni->runlist.lock);
+                        rl = ni->runlist.rl;
+                }
+                if (likely(rl != NULL)) {
+                        /* Seek to element containing target vcn. */
+                        while (rl->length && rl[1].vcn <= vcn)
+                                rl++;
+                        lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
+                } else
+                        lcn = LCN_RL_NOT_MAPPED;
+                ntfs_debug("Reading vcn = 0x%llx, lcn = 0x%llx.",
+                                (unsigned long long)vcn,
+                                (unsigned long long)lcn);
+                if (lcn < 0) {
+                        /*
+                         * When we reach the first sparse cluster we have
+                         * finished with the cb.
+                         */
+                        if (lcn == LCN_HOLE)
+                                break;
+                        if (is_retry || lcn != LCN_RL_NOT_MAPPED)
+                                goto rl_err;
+                        is_retry = TRUE;
+                        /*
+                         * Attempt to map runlist, dropping lock for the
+                         * duration.
+                         */
+                        up_read(&ni->runlist.lock);
+                        if (!ntfs_map_runlist(ni, vcn))
+                                goto lock_retry_remap;
+                        goto map_rl_err;
+                }
+                block = lcn << vol->cluster_size_bits >> block_size_bits;
+                /* Read the lcn from device in chunks of block_size bytes. */
+                max_block = block + (vol->cluster_size >> block_size_bits);
+                do {
+                        ntfs_debug("block = 0x%x.", block);
+                        if (unlikely(!(bhs[nr_bhs] = sb_getblk(sb, block))))
+                                goto getblk_err;
+                        nr_bhs++;
+                } while (++block < max_block);
+        }
+        /* Release the lock if we took it. */
+        if (rl)
+                up_read(&ni->runlist.lock);
+        /* Setup and initiate io on all buffer heads. */
+        for (i = 0; i < nr_bhs; i++) {
+                struct buffer_head *tbh = bhs[i];
+                if (unlikely(test_set_buffer_locked(tbh)))
+                        continue;
+                if (unlikely(buffer_uptodate(tbh))) {
+                        unlock_buffer(tbh);
+                        continue;
+                }
+                get_bh(tbh);
+                tbh->b_end_io = end_buffer_read_sync;
+                submit_bh(READ, tbh);
+        }
+        /* Wait for io completion on all buffer heads. */
+        for (i = 0; i < nr_bhs; i++) {
+                struct buffer_head *tbh = bhs[i];
+                if (buffer_uptodate(tbh))
+                        continue;
+                wait_on_buffer(tbh);
+                /*
+                 * We need an optimization barrier here, otherwise we start
+                 * hitting the below fixup code when accessing a loopback
+                 * mounted ntfs partition. This indicates either there is a
+                 * race condition in the loop driver or, more likely, gcc
+                 * overoptimises the code without the barrier and it doesn't
+                 * do the Right Thing(TM).
+                 */
+                barrier();
+                if (unlikely(!buffer_uptodate(tbh))) {
+                        ntfs_warning(vol->sb, "Buffer is unlocked but not "
+                                        "uptodate! Unplugging the disk queue "
+                                        "and rescheduling.");
+                        get_bh(tbh);
+                        blk_run_address_space(mapping);
+                        schedule();
+                        put_bh(tbh);
+                        if (unlikely(!buffer_uptodate(tbh)))
+                                goto read_err;
+                        ntfs_warning(vol->sb, "Buffer is now uptodate. Good.");
+                }
+        }
+        /*
+         * Get the compression buffer. We must not sleep any more
+         * until we are finished with it.
+         */
+        spin_lock(&ntfs_cb_lock);
+        cb = ntfs_compression_buffer;
+        BUG_ON(!cb);
+        cb_pos = cb;
+        cb_end = cb + cb_size;
+        /* Copy the buffer heads into the contiguous buffer. */
+        for (i = 0; i < nr_bhs; i++) {
+                memcpy(cb_pos, bhs[i]->b_data, block_size);
+                cb_pos += block_size;
+        }
+        /* Just a precaution. */
+        if (cb_pos + 2 <= cb + cb_size)
+                *(u16*)cb_pos = 0;
+        /* Reset cb_pos back to the beginning. */
+        cb_pos = cb;
+        /* We now have both source (if present) and destination. */
+        ntfs_debug("Successfully read the compression block.");
+        /* The last page and maximum offset within it for the current cb. */
+        cb_max_page = (cur_page << PAGE_CACHE_SHIFT) + cur_ofs + cb_size;
+        cb_max_ofs = cb_max_page & ~PAGE_CACHE_MASK;
+        cb_max_page >>= PAGE_CACHE_SHIFT;
+        /* Catch end of file inside a compression block. */
+        if (cb_max_page > max_page)
+                cb_max_page = max_page;
+        if (vcn == start_vcn - cb_clusters) {
+                /* Sparse cb, zero out page range overlapping the cb. */
+                ntfs_debug("Found sparse compression block.");
+                /* We can sleep from now on, so we drop lock. */
+                spin_unlock(&ntfs_cb_lock);
+                if (cb_max_ofs)
+                        cb_max_page--;
+                for (; cur_page < cb_max_page; cur_page++) {
+                        page = pages[cur_page];
+                        if (page) {
+                                /*
+                                 * FIXME: Using clear_page() will become wrong
+                                 * when we get PAGE_CACHE_SIZE != PAGE_SIZE but
+                                 * for now there is no problem.
+                                 */
+                                if (likely(!cur_ofs))
+                                        clear_page(page_address(page));
+                                else
+                                        memset(page_address(page) + cur_ofs, 0,
+                                                        PAGE_CACHE_SIZE -
+                                                        cur_ofs);
+                                flush_dcache_page(page);
+                                kunmap(page);
+                                SetPageUptodate(page);
+                                unlock_page(page);
+                                if (cur_page == xpage)
+                                        xpage_done = 1;
+                                else
+                                        page_cache_release(page);
+                                pages[cur_page] = NULL;
+                        }
+                        cb_pos += PAGE_CACHE_SIZE - cur_ofs;
+                        cur_ofs = 0;
+                        if (cb_pos >= cb_end)
+                                break;
+                }
+                /* If we have a partial final page, deal with it now. */
+                if (cb_max_ofs && cb_pos < cb_end) {
+                        page = pages[cur_page];
+                        if (page)
+                                memset(page_address(page) + cur_ofs, 0,
+                                                cb_max_ofs - cur_ofs);
+                        /*
+                         * No need to update cb_pos at this stage:
+                         *      cb_pos += cb_max_ofs - cur_ofs;
+                         */
+                        cur_ofs = cb_max_ofs;
+                }
+        } else if (vcn == start_vcn) {
+                /* We can't sleep so we need two stages. */
+                unsigned int cur2_page = cur_page;
+                unsigned int cur_ofs2 = cur_ofs;
+                u8 *cb_pos2 = cb_pos;
+                ntfs_debug("Found uncompressed compression block.");
+                /* Uncompressed cb, copy it to the destination pages. */
+                /*
+                 * TODO: As a big optimization, we could detect this case
+                 * before we read all the pages and use block_read_full_page()
+                 * on all full pages instead (we still have to treat partial
+                 * pages especially but at least we are getting rid of the
+                 * synchronous io for the majority of pages.
+                 * Or if we choose not to do the read-ahead/-behind stuff, we
+                 * could just return block_read_full_page(pages[xpage]) as long
+                 * as PAGE_CACHE_SIZE <= cb_size.
+                 */
+                if (cb_max_ofs)
+                        cb_max_page--;
+                /* First stage: copy data into destination pages. */
+                for (; cur_page < cb_max_page; cur_page++) {
+                        page = pages[cur_page];
+                        if (page)
+                                memcpy(page_address(page) + cur_ofs, cb_pos,
+                                                PAGE_CACHE_SIZE - cur_ofs);
+                        cb_pos += PAGE_CACHE_SIZE - cur_ofs;
+                        cur_ofs = 0;
+                        if (cb_pos >= cb_end)
+                                break;
+                }
+                /* If we have a partial final page, deal with it now. */
+                if (cb_max_ofs && cb_pos < cb_end) {
+                        page = pages[cur_page];
+                        if (page)
+                                memcpy(page_address(page) + cur_ofs, cb_pos,
+                                                cb_max_ofs - cur_ofs);
+                        cb_pos += cb_max_ofs - cur_ofs;
+                        cur_ofs = cb_max_ofs;
+                }
+                /* We can sleep from now on, so drop lock. */
+                spin_unlock(&ntfs_cb_lock);
+                /* Second stage: finalize pages. */
+                for (; cur2_page < cb_max_page; cur2_page++) {
+                        page = pages[cur2_page];
+                        if (page) {
+                                /*
+                                 * If we are outside the initialized size, zero
+                                 * the out of bounds page range.
+                                 */
+                                handle_bounds_compressed_page(ni, page);
+                                flush_dcache_page(page);
+                                kunmap(page);
+                                SetPageUptodate(page);
+                                unlock_page(page);
+                                if (cur2_page == xpage)
+                                        xpage_done = 1;
+                                else
+                                        page_cache_release(page);
+                                pages[cur2_page] = NULL;
+                        }
+                        cb_pos2 += PAGE_CACHE_SIZE - cur_ofs2;
+                        cur_ofs2 = 0;
+                        if (cb_pos2 >= cb_end)
+                                break;
+                }
+        } else {
+                /* Compressed cb, decompress it into the destination page(s). */
+                unsigned int prev_cur_page = cur_page;
+                ntfs_debug("Found compressed compression block.");
+                err = ntfs_decompress(pages, &cur_page, &cur_ofs,
+                                cb_max_page, cb_max_ofs, xpage, &xpage_done,
+                                cb_pos, cb_size - (cb_pos - cb));
+                /*
+                 * We can sleep from now on, lock already dropped by
+                 * ntfs_decompress().
+                 */
+                if (err) {
+                        ntfs_error(vol->sb, "ntfs_decompress() failed in inode "
+                                        "0x%lx with error code %i. Skipping "
+                                        "this compression block.",
+                                        ni->mft_no, -err);
+                        /* Release the unfinished pages. */
+                        for (; prev_cur_page < cur_page; prev_cur_page++) {
+                                page = pages[prev_cur_page];
+                                if (page) {
+                                        if (prev_cur_page == xpage &&
+                                                        !xpage_done)
+                                                SetPageError(page);
+                                        flush_dcache_page(page);
+                                        kunmap(page);
+                                        unlock_page(page);
+                                        if (prev_cur_page != xpage)
+                                                page_cache_release(page);
+                                        pages[prev_cur_page] = NULL;
+                                }
+                        }
+                }
+        }
+        /* Release the buffer heads. */
+        for (i = 0; i < nr_bhs; i++)
+                brelse(bhs[i]);
+        /* Do we have more work to do? */
+        if (nr_cbs)
+                goto do_next_cb;
+        /* We no longer need the list of buffer heads. */
+        kfree(bhs);
+        /* Clean up if we have any pages left. Should never happen. */
+        for (cur_page = 0; cur_page < max_page; cur_page++) {
+                page = pages[cur_page];
+                if (page) {
+                        ntfs_error(vol->sb, "Still have pages left! "
+                                        "Terminating them with extreme "
+                                        "prejudice.  Inode 0x%lx, page index "
+                                        "0x%lx.", ni->mft_no, page->index);
+                        if (cur_page == xpage && !xpage_done)
+                                SetPageError(page);
+                        flush_dcache_page(page);
+                        kunmap(page);
+                        unlock_page(page);
+                        if (cur_page != xpage)
+                                page_cache_release(page);
+                        pages[cur_page] = NULL;
+                }
+        }
+        /* We no longer need the list of pages. */
+        kfree(pages);
+        /* If we have completed the requested page, we return success. */
+        if (likely(xpage_done))
+                return 0;
+        ntfs_debug("Failed. Returning error code %s.", err == -EOVERFLOW ?
+                        "EOVERFLOW" : (!err ? "EIO" : "unkown error"));
+        return err < 0 ? err : -EIO;
+read_err:
+        ntfs_error(vol->sb, "IO error while reading compressed data.");
+        goto err_out;
+map_rl_err:
+        ntfs_error(vol->sb, "ntfs_map_runlist() failed. Cannot read "
+                        "compression block.");
+        goto err_out;
+rl_err:
+        up_read(&ni->runlist.lock);
+        ntfs_error(vol->sb, "ntfs_rl_vcn_to_lcn() failed. Cannot read "
+                        "compression block.");
+        goto err_out;
+getblk_err:
+        up_read(&ni->runlist.lock);
+        ntfs_error(vol->sb, "getblk() failed. Cannot read compression block.");
+err_out:
+        /* Release the buffer heads obtained for the current cb. */
+        for (i = 0; i < nr_bhs; i++)
+                brelse(bhs[i]);
+        kfree(bhs);
+        for (i = cur_page; i < max_page; i++) {
+                page = pages[i];
+                if (page) {
+                        if (i == xpage && !xpage_done)
+                                SetPageError(page);
+                        flush_dcache_page(page);
+                        kunmap(page);
+                        unlock_page(page);
+                        if (i != xpage)
+                                page_cache_release(page);
+                }
+        }
+        kfree(pages);
+        return -EIO;
+}
diff --git a/fs/ntfs/debug.c b/fs/ntfs/debug.c
new file mode 100644
index 000000000000..6fb6bb5e3723
--- /dev/null
+++ b/fs/ntfs/debug.c
@@ -0,0 +1,180 @@
+/*
+ * debug.c - NTFS kernel debug support. Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2004 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#include "debug.h"
+/*
+ * Scratch buffer into which each message is formatted before it is printed,
+ * and the spinlock serialising all users of that buffer.
+ */
+static char err_buf[1024];
+static DEFINE_SPINLOCK(err_buf_lock);
+/**
+ * __ntfs_warning - output a warning to the syslog
+ * @function:   name of the calling function (may be NULL)
+ * @sb:         super block of the mounted ntfs filesystem (may be NULL)
+ * @fmt:        printf style format string of the warning
+ * @...:        the arguments consumed by @fmt
+ *
+ * Formats the warning described by @fmt and @... and prints it, prefixed by
+ * the name of the calling @function.  When @sb is given, the device name
+ * sb->s_id is included in the prefix as well.
+ *
+ * The message is truncated to fit the shared err_buf.
+ *
+ * Unless DEBUG is defined, the message is dropped whenever
+ * printk_ratelimit() returns zero.
+ *
+ * Note, you should be using debug.h::ntfs_warning(@sb, @fmt, @...) instead
+ * as this provides the @function parameter automatically.
+ */
+void __ntfs_warning(const char *function, const struct super_block *sb,
+                const char *fmt, ...)
+{
+        /* Fall back to an empty name if none (or an empty one) was given. */
+        const char *caller = (function && *function) ? function : "";
+        va_list ap;
+#ifndef DEBUG
+        /* Non-debug builds drop the message when printk is rate limited. */
+        if (!printk_ratelimit())
+                return;
+#endif
+        /* err_buf is shared, so format and print under err_buf_lock. */
+        spin_lock(&err_buf_lock);
+        va_start(ap, fmt);
+        vsnprintf(err_buf, sizeof(err_buf), fmt, ap);
+        va_end(ap);
+        if (!sb)
+                printk(KERN_ERR "NTFS-fs warning: %s(): %s\n", caller, err_buf);
+        else
+                printk(KERN_ERR "NTFS-fs warning (device %s): %s(): %s\n",
+                                sb->s_id, caller, err_buf);
+        spin_unlock(&err_buf_lock);
+}
+/**
+ * __ntfs_error - output an error to the syslog
+ * @function:   name of the calling function (may be NULL)
+ * @sb:         super block of the mounted ntfs filesystem (may be NULL)
+ * @fmt:        printf style format string of the error
+ * @...:        the arguments consumed by @fmt
+ *
+ * Formats the error described by @fmt and @... and prints it, prefixed by
+ * the name of the calling @function.  When @sb is given, the device name
+ * sb->s_id is included in the prefix as well.
+ *
+ * The message is truncated to fit the shared err_buf.
+ *
+ * Unless DEBUG is defined, the message is dropped whenever
+ * printk_ratelimit() returns zero.
+ *
+ * Note, you should be using debug.h::ntfs_error(@sb, @fmt, @...) instead
+ * as this provides the @function parameter automatically.
+ */
+void __ntfs_error(const char *function, const struct super_block *sb,
+                const char *fmt, ...)
+{
+        /* Fall back to an empty name if none (or an empty one) was given. */
+        const char *caller = (function && *function) ? function : "";
+        va_list ap;
+#ifndef DEBUG
+        /* Non-debug builds drop the message when printk is rate limited. */
+        if (!printk_ratelimit())
+                return;
+#endif
+        /* err_buf is shared, so format and print under err_buf_lock. */
+        spin_lock(&err_buf_lock);
+        va_start(ap, fmt);
+        vsnprintf(err_buf, sizeof(err_buf), fmt, ap);
+        va_end(ap);
+        if (!sb)
+                printk(KERN_ERR "NTFS-fs error: %s(): %s\n", caller, err_buf);
+        else
+                printk(KERN_ERR "NTFS-fs error (device %s): %s(): %s\n",
+                                sb->s_id, caller, err_buf);
+        spin_unlock(&err_buf_lock);
+}
+#ifdef DEBUG
+/* Non-zero enables the debug output functions below; zero silences them. */
+int debug_msgs = 0;
+void __ntfs_debug (const char *file, int line, const char *function,
+                const char *fmt, ...)
+{
+        /* Fall back to an empty name if none (or an empty one) was given. */
+        const char *caller = (function && *function) ? function : "";
+        va_list ap;
+        /* Debug output is switched off at run time: nothing to print. */
+        if (debug_msgs == 0)
+                return;
+        spin_lock(&err_buf_lock);
+        va_start(ap, fmt);
+        vsnprintf(err_buf, sizeof(err_buf), fmt, ap);
+        va_end(ap);
+        printk(KERN_DEBUG "NTFS-fs DEBUG (%s, %d): %s(): %s\n", file, line,
+                        caller, err_buf);
+        spin_unlock(&err_buf_lock);
+}
+/* Dump @rl to the kernel log; a no-op unless debug_msgs is set.  The caller
+ * has to provide synchronisation for @rl. */
+void ntfs_debug_dump_runlist(const runlist_element *rl)
+{
+        int i;
+        const char *lcn_str[5] = { "LCN_HOLE         ", "LCN_RL_NOT_MAPPED",
+                                   "LCN_ENOENT       ", "LCN_unknown      " };
+        if (!debug_msgs)
+                return;
+        printk(KERN_DEBUG "NTFS-fs DEBUG: Dumping runlist (values in hex):\n");
+        if (!rl) {
+                printk(KERN_DEBUG "Run list not present.\n");
+                return;
+        }
+        printk(KERN_DEBUG "VCN              LCN               Run length\n");
+        for (i = 0; ; i++) {
+                LCN lcn = (rl + i)->lcn;
+                if (lcn < (LCN)0) {
+                        /* Test in LCN width; narrowing to int could wrap. */
+                        int index = lcn < LCN_ENOENT ? 3 : (int)(-lcn - 1);
+                        printk(KERN_DEBUG "%-16Lx %s %-16Lx%s\n",
+                                        (rl + i)->vcn, lcn_str[index],
+                                        (rl + i)->length, (rl + i)->length ?
+                                        "" : " (runlist end)");
+                } else
+                        printk(KERN_DEBUG "%-16Lx %-16Lx  %-16Lx%s\n",
+                                        (rl + i)->vcn, (rl + i)->lcn,
+                                        (rl + i)->length, (rl + i)->length ?
+                                        "" : " (runlist end)");
+                if (!(rl + i)->length)
+                        break;
+        }
+}
+#endif
diff --git a/fs/ntfs/debug.h b/fs/ntfs/debug.h
new file mode 100644
index 000000000000..8ac37c33d127
--- /dev/null
+++ b/fs/ntfs/debug.h
@@ -0,0 +1,67 @@
+/*
+ * debug.h - NTFS kernel debug support. Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2004 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#ifndef _LINUX_NTFS_DEBUG_H
+#define _LINUX_NTFS_DEBUG_H
+#include <linux/fs.h>
+#include "runlist.h"
+#ifdef DEBUG    /* Debug build: real logging helpers (no-op macros otherwise). */
+extern int debug_msgs;  /* ntfs_debug_dump_runlist() is silent while this is 0 */
+#if 0 /* Never compiled: prototype only so kernel-doc can document the macro */
+/**
+ * ntfs_debug - write a debug level message to syslog
+ * @f:          a printf format string containing the message
+ * @...:        the variables to substitute into @f
+ *
+ * ntfs_debug() logs @f at DEBUG level, tagged with the caller's file, line and
+ * function, but only if the driver was compiled with -DDEBUG; else it is a NOP.
+ */
+static void ntfs_debug(const char *f, ...);
+#endif
+extern void __ntfs_debug (const char *file, int line, const char *function,
+        const char *format, ...) __attribute__ ((format (printf, 4, 5)));
+#define ntfs_debug(f, a...)                                             \
+        __ntfs_debug(__FILE__, __LINE__, __FUNCTION__, f, ##a)  /* adds call site */
+extern void ntfs_debug_dump_runlist(const runlist_element *rl);
+#else   /* !DEBUG */
+#define ntfs_debug(f, a...)             do {} while (0)  /* compiled out */
+#define ntfs_debug_dump_runlist(rl)     do {} while (0)  /* compiled out */
+#endif  /* !DEBUG; the warning/error helpers below are declared in all builds */
+extern void __ntfs_warning(const char *function, const struct super_block *sb,
+                const char *fmt, ...) __attribute__ ((format (printf, 3, 4)));
+#define ntfs_warning(sb, f, a...)       __ntfs_warning(__FUNCTION__, sb, f, ##a)
+extern void __ntfs_error(const char *function, const struct super_block *sb,
+                const char *fmt, ...) __attribute__ ((format (printf, 3, 4)));
+#define ntfs_error(sb, f, a...)         __ntfs_error(__FUNCTION__, sb, f, ##a)
+#endif /* _LINUX_NTFS_DEBUG_H */
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
new file mode 100644
index 000000000000..93577561cdbe
--- /dev/null
+++ b/fs/ntfs/dir.c
@@ -0,0 +1,1569 @@
+/*
+ * dir.c - NTFS kernel directory operations. Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2004 Anton Altaparmakov
+ * Copyright (c) 2002 Richard Russon
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#include <linux/smp_lock.h>
+#include <linux/buffer_head.h>
+#include "dir.h"
+#include "aops.h"
+#include "attrib.h"
+#include "mft.h"
+#include "debug.h"
+#include "ntfs.h"
+/* The little endian Unicode string "$I30" plus a terminating zero, as a global
+ * constant.  ntfs_lookup_inode_by_name() passes it (with length 4) as the
+ * attribute name when looking up the directory index root. */
+ntfschar I30[5] = { const_cpu_to_le16('$'), const_cpu_to_le16('I'),
+                const_cpu_to_le16('3'), const_cpu_to_le16('0'), 0 };
+/**
+ * ntfs_lookup_inode_by_name - find an inode in a directory given its name
+ * @dir_ni:     ntfs inode of the directory in which to search for the name
+ * @uname:      Unicode name for which to search in the directory
+ * @uname_len:  length of the name @uname in Unicode characters
+ * @res:        return the found file name if necessary (see below)
+ *
+ * Look for an inode with name @uname in the directory with inode @dir_ni.
+ * ntfs_lookup_inode_by_name() walks the contents of the directory looking for
+ * the Unicode name. If the name is found in the directory, the corresponding
+ * inode number (>= 0) is returned as a mft reference in cpu format, i.e. it
+ * is a 64-bit number containing the sequence number.
+ *
+ * On error, ERR_MREF(negative error code) is returned, notably -ENOENT if the
+ * name is not found and -EIO if the index is corrupt (this includes zero length
+ * index entries). Note that you can't just check the return value for being
+ * negative, you have to check the inode number for being negative which you can
+ * extract using MREC(return value).
+ *
+ * Note, @uname_len does not include the (optional) terminating NUL character.
+ *
+ * Note, we look for a case sensitive match first but we also look for a case
+ * insensitive match at the same time. If we find a case insensitive match, we
+ * save that for the case that we don't find an exact match, where we return
+ * the case insensitive match and setup @res (which we allocate!) with the mft
+ * reference, the file name type, length and with a copy of the little endian
+ * Unicode file name itself. If we match a file name which is in the DOS name
+ * space, we only return the mft reference and file name type in @res.
+ * ntfs_lookup() then uses this to find the long file name in the inode itself.
+ * This is to avoid polluting the dcache with short file names. We want them to
+ * work but we don't care for how quickly one can access them. This also fixes
+ * the dcache aliasing issues.
+ *
+ * Locking:  - Caller must hold i_sem on the directory.
+ *           - Each page cache page in the index allocation mapping must be
+ *             locked whilst being accessed otherwise we may find a corrupt
+ *             page due to it being under ->writepage at the moment which
+ *             applies the mst protection fixups before writing out and then
+ *             removes them again after the write is complete after which it
+ *             unlocks the page.
+ */
+MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname,
+                const int uname_len, ntfs_name **res)
+{
+        ntfs_volume *vol = dir_ni->vol;
+        struct super_block *sb = vol->sb;
+        MFT_RECORD *m;
+        INDEX_ROOT *ir;
+        INDEX_ENTRY *ie;
+        INDEX_ALLOCATION *ia;
+        u8 *index_end;
+        u64 mref;
+        ntfs_attr_search_ctx *ctx;
+        int err, rc;
+        VCN vcn, old_vcn;
+        struct address_space *ia_mapping;
+        struct page *page;
+        u8 *kaddr;
+        ntfs_name *name = NULL;
+        BUG_ON(!S_ISDIR(VFS_I(dir_ni)->i_mode));
+        BUG_ON(NInoAttr(dir_ni));
+        /* Get hold of the mft record for the directory. */
+        m = map_mft_record(dir_ni);
+        if (IS_ERR(m)) {
+                ntfs_error(sb, "map_mft_record() failed with error code %ld.",
+                                -PTR_ERR(m));
+                return ERR_MREF(PTR_ERR(m));
+        }
+        ctx = ntfs_attr_get_search_ctx(dir_ni, m);
+        if (unlikely(!ctx)) {
+                err = -ENOMEM;
+                goto err_out;
+        }
+        /* Find the index root attribute in the mft record. */
+        err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL,
+                        0, ctx);
+        if (unlikely(err)) {
+                if (err == -ENOENT) {
+                        ntfs_error(sb, "Index root attribute missing in "
+                                        "directory inode 0x%lx.",
+                                        dir_ni->mft_no);
+                        err = -EIO;
+                }
+                goto err_out;
+        }
+        /* Get to the index root value (it's been verified in read_inode). */
+        ir = (INDEX_ROOT*)((u8*)ctx->attr +
+                        le16_to_cpu(ctx->attr->data.resident.value_offset));
+        index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length);
+        /* The first index entry. */
+        ie = (INDEX_ENTRY*)((u8*)&ir->index +
+                        le32_to_cpu(ir->index.entries_offset));
+        /*
+         * Loop until we exceed valid memory (corruption case) or until we
+         * reach the last entry.
+         */
+        for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
+                /* Bounds checks; a zero length would make this loop spin. */
+                if ((u8*)ie < (u8*)ctx->mrec || (u8*)ie +
+                                sizeof(INDEX_ENTRY_HEADER) > index_end ||
+                                (u8*)ie + le16_to_cpu(ie->key_length) >
+                                index_end || !ie->length)
+                        goto dir_err_out;
+                /*
+                 * The last entry cannot contain a name. It can however contain
+                 * a pointer to a child node in the B+tree so we just break out.
+                 */
+                if (ie->flags & INDEX_ENTRY_END)
+                        break;
+                /*
+                 * We perform a case sensitive comparison and if that matches
+                 * we are done and return the mft reference of the inode (i.e.
+                 * the inode number together with the sequence number for
+                 * consistency checking). We convert it to cpu format before
+                 * returning.
+                 */
+                if (ntfs_are_names_equal(uname, uname_len,
+                                (ntfschar*)&ie->key.file_name.file_name,
+                                ie->key.file_name.file_name_length,
+                                CASE_SENSITIVE, vol->upcase, vol->upcase_len)) {
+found_it:
+                        /*
+                         * We have a perfect match, so we don't need to care
+                         * about having matched imperfectly before, so we can
+                         * free name and set *res to NULL.
+                         * However, if the perfect match is a short file name,
+                         * we need to signal this through *res, so that
+                         * ntfs_lookup() can fix dcache aliasing issues.
+                         * As an optimization we just reuse an existing
+                         * allocation of *res.
+                         */
+                        if (ie->key.file_name.file_name_type == FILE_NAME_DOS) {
+                                if (!name) {
+                                        name = kmalloc(sizeof(ntfs_name),
+                                                        GFP_NOFS);
+                                        if (!name) {
+                                                err = -ENOMEM;
+                                                goto err_out;
+                                        }
+                                }
+                                name->mref = le64_to_cpu(
+                                                ie->data.dir.indexed_file);
+                                name->type = FILE_NAME_DOS;
+                                name->len = 0;
+                                *res = name;
+                        } else {
+                                if (name)
+                                        kfree(name);
+                                *res = NULL;
+                        }
+                        mref = le64_to_cpu(ie->data.dir.indexed_file);
+                        ntfs_attr_put_search_ctx(ctx);
+                        unmap_mft_record(dir_ni);
+                        return mref;
+                }
+                /*
+                 * For a case insensitive mount, we also perform a case
+                 * insensitive comparison (provided the file name is not in the
+                 * POSIX namespace). If the comparison matches, and the name is
+                 * in the WIN32 namespace, we cache the filename in *res so
+                 * that the caller, ntfs_lookup(), can work on it. If the
+                 * comparison matches, and the name is in the DOS namespace, we
+                 * only cache the mft reference and the file name type (we set
+                 * the name length to zero for simplicity).
+                 */
+                if (!NVolCaseSensitive(vol) &&
+                                ie->key.file_name.file_name_type &&
+                                ntfs_are_names_equal(uname, uname_len,
+                                (ntfschar*)&ie->key.file_name.file_name,
+                                ie->key.file_name.file_name_length,
+                                IGNORE_CASE, vol->upcase, vol->upcase_len)) {
+                        int name_size = sizeof(ntfs_name);
+                        u8 type = ie->key.file_name.file_name_type;
+                        u8 len = ie->key.file_name.file_name_length;
+                        /* Only one case insensitive matching name allowed. */
+                        if (name) {
+                                ntfs_error(sb, "Found already allocated name "
+                                                "in phase 1. Please run chkdsk "
+                                                "and if that doesn't find any "
+                                                "errors please report you saw "
+                                                "this message to "
+                                                "linux-ntfs-dev@lists."
+                                                "sourceforge.net.");
+                                goto dir_err_out;
+                        }
+                        if (type != FILE_NAME_DOS)
+                                name_size += len * sizeof(ntfschar);
+                        name = kmalloc(name_size, GFP_NOFS);
+                        if (!name) {
+                                err = -ENOMEM;
+                                goto err_out;
+                        }
+                        name->mref = le64_to_cpu(ie->data.dir.indexed_file);
+                        name->type = type;
+                        if (type != FILE_NAME_DOS) {
+                                name->len = len;
+                                memcpy(name->name, ie->key.file_name.file_name,
+                                                len * sizeof(ntfschar));
+                        } else
+                                name->len = 0;
+                        *res = name;
+                }
+                /*
+                 * Not a perfect match, need to do full blown collation so we
+                 * know which way in the B+tree we have to go.
+                 */
+                rc = ntfs_collate_names(uname, uname_len,
+                                (ntfschar*)&ie->key.file_name.file_name,
+                                ie->key.file_name.file_name_length, 1,
+                                IGNORE_CASE, vol->upcase, vol->upcase_len);
+                /*
+                 * If uname collates before the name of the current entry, there
+                 * is definitely no such name in this index but we might need to
+                 * descend into the B+tree so we just break out of the loop.
+                 */
+                if (rc == -1)
+                        break;
+                /* The names are not equal, continue the search. */
+                if (rc)
+                        continue;
+                /*
+                 * Names match with case insensitive comparison, now try the
+                 * case sensitive comparison, which is required for proper
+                 * collation.
+                 */
+                rc = ntfs_collate_names(uname, uname_len,
+                                (ntfschar*)&ie->key.file_name.file_name,
+                                ie->key.file_name.file_name_length, 1,
+                                CASE_SENSITIVE, vol->upcase, vol->upcase_len);
+                if (rc == -1)
+                        break;
+                if (rc)
+                        continue;
+                /*
+                 * Perfect match, this will never happen as the
+                 * ntfs_are_names_equal() call will have gotten a match but we
+                 * still treat it correctly.
+                 */
+                goto found_it;
+        }
+        /*
+         * We have finished with this index without success. Check for the
+         * presence of a child node and if not present return -ENOENT, unless
+         * we have got a matching name cached in name in which case return the
+         * mft reference associated with it.
+         */
+        if (!(ie->flags & INDEX_ENTRY_NODE)) {
+                if (name) {
+                        ntfs_attr_put_search_ctx(ctx);
+                        unmap_mft_record(dir_ni);
+                        return name->mref;
+                }
+                ntfs_debug("Entry not found.");
+                err = -ENOENT;
+                goto err_out;
+        } /* Child node present, descend into it. */
+        /* Consistency check: Verify that an index allocation exists. */
+        if (!NInoIndexAllocPresent(dir_ni)) {
+                ntfs_error(sb, "No index allocation attribute but index entry "
+                                "requires one. Directory inode 0x%lx is "
+                                "corrupt or driver bug.", dir_ni->mft_no);
+                goto err_out;   /* err is still 0 here, so -EIO is returned. */
+        }
+        /* Get the starting vcn of the index_block holding the child node. */
+        vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8));
+        ia_mapping = VFS_I(dir_ni)->i_mapping;
+        /*
+         * We are done with the index root and the mft record. Release them,
+         * otherwise we deadlock with ntfs_map_page().
+         */
+        ntfs_attr_put_search_ctx(ctx);
+        unmap_mft_record(dir_ni);
+        m = NULL;
+        ctx = NULL;
+descend_into_child_node:
+        /*
+         * Convert vcn to index into the index allocation attribute in units
+         * of PAGE_CACHE_SIZE and map the page cache page, reading it from
+         * disk if necessary.
+         */
+        page = ntfs_map_page(ia_mapping, vcn <<
+                        dir_ni->itype.index.vcn_size_bits >> PAGE_CACHE_SHIFT);
+        if (IS_ERR(page)) {
+                ntfs_error(sb, "Failed to map directory index page, error %ld.",
+                                -PTR_ERR(page));
+                err = PTR_ERR(page);
+                goto err_out;
+        }
+        lock_page(page);
+        kaddr = (u8*)page_address(page);
+fast_descend_into_child_node:
+        /* Get to the index allocation block. */
+        ia = (INDEX_ALLOCATION*)(kaddr + ((vcn <<
+                        dir_ni->itype.index.vcn_size_bits) & ~PAGE_CACHE_MASK));
+        /* Bounds checks. */
+        if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_CACHE_SIZE) {
+                ntfs_error(sb, "Out of bounds check failed. Corrupt directory "
+                                "inode 0x%lx or driver bug.", dir_ni->mft_no);
+                goto unm_err_out;
+        }
+        /* Catch multi sector transfer fixup errors. */
+        if (unlikely(!ntfs_is_indx_record(ia->magic))) {
+                ntfs_error(sb, "Directory index record with vcn 0x%llx is "
+                                "corrupt.  Corrupt inode 0x%lx.  Run chkdsk.",
+                                (unsigned long long)vcn, dir_ni->mft_no);
+                goto unm_err_out;
+        }
+        if (sle64_to_cpu(ia->index_block_vcn) != vcn) {
+                ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is "
+                                "different from expected VCN (0x%llx). "
+                                "Directory inode 0x%lx is corrupt or driver "
+                                "bug.", (unsigned long long)
+                                sle64_to_cpu(ia->index_block_vcn),
+                                (unsigned long long)vcn, dir_ni->mft_no);
+                goto unm_err_out;
+        }
+        if (le32_to_cpu(ia->index.allocated_size) + 0x18 !=
+                        dir_ni->itype.index.block_size) {
+                ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
+                                "0x%lx has a size (%u) differing from the "
+                                "directory specified size (%u). Directory "
+                                "inode is corrupt or driver bug.",
+                                (unsigned long long)vcn, dir_ni->mft_no,
+                                le32_to_cpu(ia->index.allocated_size) + 0x18,
+                                dir_ni->itype.index.block_size);
+                goto unm_err_out;
+        }
+        index_end = (u8*)ia + dir_ni->itype.index.block_size;
+        if (index_end > kaddr + PAGE_CACHE_SIZE) {
+                ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
+                                "0x%lx crosses page boundary. Impossible! "
+                                "Cannot access! This is probably a bug in the "
+                                "driver.", (unsigned long long)vcn,
+                                dir_ni->mft_no);
+                goto unm_err_out;
+        }
+        index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length);
+        if (index_end > (u8*)ia + dir_ni->itype.index.block_size) {
+                ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of directory "
+                                "inode 0x%lx exceeds maximum size.",
+                                (unsigned long long)vcn, dir_ni->mft_no);
+                goto unm_err_out;
+        }
+        /* The first index entry. */
+        ie = (INDEX_ENTRY*)((u8*)&ia->index +
+                        le32_to_cpu(ia->index.entries_offset));
+        /*
+         * Iterate similar to above big loop but applied to index buffer, thus
+         * loop until we exceed valid memory (corruption case) or until we
+         * reach the last entry.
+         */
+        for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
+                /* Bounds check; a zero length would make this loop spin. */
+                if ((u8*)ie < (u8*)ia || (u8*)ie +
+                                sizeof(INDEX_ENTRY_HEADER) > index_end ||
+                                (u8*)ie + le16_to_cpu(ie->key_length) >
+                                index_end || !ie->length) {
+                        ntfs_error(sb, "Index entry out of bounds in "
+                                        "directory inode 0x%lx.",
+                                        dir_ni->mft_no);
+                        goto unm_err_out;
+                }
+                /*
+                 * The last entry cannot contain a name. It can however contain
+                 * a pointer to a child node in the B+tree so we just break out.
+                 */
+                if (ie->flags & INDEX_ENTRY_END)
+                        break;
+                /*
+                 * We perform a case sensitive comparison and if that matches
+                 * we are done and return the mft reference of the inode (i.e.
+                 * the inode number together with the sequence number for
+                 * consistency checking). We convert it to cpu format before
+                 * returning.
+                 */
+                if (ntfs_are_names_equal(uname, uname_len,
+                                (ntfschar*)&ie->key.file_name.file_name,
+                                ie->key.file_name.file_name_length,
+                                CASE_SENSITIVE, vol->upcase, vol->upcase_len)) {
+found_it2:
+                        /*
+                         * We have a perfect match, so we don't need to care
+                         * about having matched imperfectly before, so we can
+                         * free name and set *res to NULL.
+                         * However, if the perfect match is a short file name,
+                         * we need to signal this through *res, so that
+                         * ntfs_lookup() can fix dcache aliasing issues.
+                         * As an optimization we just reuse an existing
+                         * allocation of *res.
+                         */
+                        if (ie->key.file_name.file_name_type == FILE_NAME_DOS) {
+                                if (!name) {
+                                        name = kmalloc(sizeof(ntfs_name),
+                                                        GFP_NOFS);
+                                        if (!name) {
+                                                err = -ENOMEM;
+                                                goto unm_err_out;
+                                        }
+                                }
+                                name->mref = le64_to_cpu(
+                                                ie->data.dir.indexed_file);
+                                name->type = FILE_NAME_DOS;
+                                name->len = 0;
+                                *res = name;
+                        } else {
+                                if (name)
+                                        kfree(name);
+                                *res = NULL;
+                        }
+                        mref = le64_to_cpu(ie->data.dir.indexed_file);
+                        unlock_page(page);
+                        ntfs_unmap_page(page);
+                        return mref;
+                }
+                /*
+                 * For a case insensitive mount, we also perform a case
+                 * insensitive comparison (provided the file name is not in the
+                 * POSIX namespace). If the comparison matches, and the name is
+                 * in the WIN32 namespace, we cache the filename in *res so
+                 * that the caller, ntfs_lookup(), can work on it. If the
+                 * comparison matches, and the name is in the DOS namespace, we
+                 * only cache the mft reference and the file name type (we set
+                 * the name length to zero for simplicity).
+                 */
+                if (!NVolCaseSensitive(vol) &&
+                                ie->key.file_name.file_name_type &&
+                                ntfs_are_names_equal(uname, uname_len,
+                                (ntfschar*)&ie->key.file_name.file_name,
+                                ie->key.file_name.file_name_length,
+                                IGNORE_CASE, vol->upcase, vol->upcase_len)) {
+                        int name_size = sizeof(ntfs_name);
+                        u8 type = ie->key.file_name.file_name_type;
+                        u8 len = ie->key.file_name.file_name_length;
+                        /* Only one case insensitive matching name allowed. */
+                        if (name) {
+                                ntfs_error(sb, "Found already allocated name "
+                                                "in phase 2. Please run chkdsk "
+                                                "and if that doesn't find any "
+                                                "errors please report you saw "
+                                                "this message to "
+                                                "linux-ntfs-dev@lists."
+                                                "sourceforge.net.");
+                                unlock_page(page);
+                                ntfs_unmap_page(page);
+                                goto dir_err_out;
+                        }
+                        if (type != FILE_NAME_DOS)
+                                name_size += len * sizeof(ntfschar);
+                        name = kmalloc(name_size, GFP_NOFS);
+                        if (!name) {
+                                err = -ENOMEM;
+                                goto unm_err_out;
+                        }
+                        name->mref = le64_to_cpu(ie->data.dir.indexed_file);
+                        name->type = type;
+                        if (type != FILE_NAME_DOS) {
+                                name->len = len;
+                                memcpy(name->name, ie->key.file_name.file_name,
+                                                len * sizeof(ntfschar));
+                        } else
+                                name->len = 0;
+                        *res = name;
+                }
+                /*
+                 * Not a perfect match, need to do full blown collation so we
+                 * know which way in the B+tree we have to go.
+                 */
+                rc = ntfs_collate_names(uname, uname_len,
+                                (ntfschar*)&ie->key.file_name.file_name,
+                                ie->key.file_name.file_name_length, 1,
+                                IGNORE_CASE, vol->upcase, vol->upcase_len);
+                /*
+                 * If uname collates before the name of the current entry, there
+                 * is definitely no such name in this index but we might need to
+                 * descend into the B+tree so we just break out of the loop.
+                 */
+                if (rc == -1)
+                        break;
+                /* The names are not equal, continue the search. */
+                if (rc)
+                        continue;
+                /*
+                 * Names match with case insensitive comparison, now try the
+                 * case sensitive comparison, which is required for proper
+                 * collation.
+                 */
+                rc = ntfs_collate_names(uname, uname_len,
+                                (ntfschar*)&ie->key.file_name.file_name,
+                                ie->key.file_name.file_name_length, 1,
+                                CASE_SENSITIVE, vol->upcase, vol->upcase_len);
+                if (rc == -1)
+                        break;
+                if (rc)
+                        continue;
+                /*
+                 * Perfect match, this will never happen as the
+                 * ntfs_are_names_equal() call will have gotten a match but we
+                 * still treat it correctly.
+                 */
+                goto found_it2;
+        }
+        /*
+         * We have finished with this index buffer without success. Check for
+         * the presence of a child node.
+         */
+        if (ie->flags & INDEX_ENTRY_NODE) {
+                if ((ia->index.flags & NODE_MASK) == LEAF_NODE) {
+                        ntfs_error(sb, "Index entry with child node found in "
+                                        "a leaf node in directory inode 0x%lx.",
+                                        dir_ni->mft_no);
+                        goto unm_err_out;
+                }
+                /* Child node present, descend into it. */
+                old_vcn = vcn;
+                vcn = sle64_to_cpup((sle64*)((u8*)ie +
+                                le16_to_cpu(ie->length) - 8));
+                if (vcn >= 0) {
+                        /* If vcn is in the same page cache page as old_vcn (by
+                         * the shift used when mapping) we recycle the page. */
+                        if (old_vcn << dir_ni->itype.index.vcn_size_bits >>
+                                        PAGE_CACHE_SHIFT == vcn <<
+                                        dir_ni->itype.index.vcn_size_bits >>
+                                        PAGE_CACHE_SHIFT)
+                                goto fast_descend_into_child_node;
+                        unlock_page(page);
+                        ntfs_unmap_page(page);
+                        goto descend_into_child_node;
+                }
+                ntfs_error(sb, "Negative child node vcn in directory inode "
+                                "0x%lx.", dir_ni->mft_no);
+                goto unm_err_out;
+        }
+        /*
+         * No child node present, return -ENOENT, unless we have got a matching
+         * name cached in name in which case return the mft reference
+         * associated with it.
+         */
+        if (name) {
+                unlock_page(page);
+                ntfs_unmap_page(page);
+                return name->mref;
+        }
+        ntfs_debug("Entry not found.");
+        err = -ENOENT;
+unm_err_out:
+        unlock_page(page);
+        ntfs_unmap_page(page);
+err_out:
+        if (!err)               /* No specific code recorded: report -EIO. */
+                err = -EIO;
+        if (ctx)
+                ntfs_attr_put_search_ctx(ctx);
+        if (m)
+                unmap_mft_record(dir_ni);
+        if (name) {
+                kfree(name);
+                *res = NULL;
+        }
+        return ERR_MREF(err);
+dir_err_out:
+        ntfs_error(sb, "Corrupt directory.  Aborting lookup.");
+        goto err_out;
+}
+#if 0
+// TODO: (AIA)
+// The algorithm embedded in this code will be required for the time when we
+// want to support adding of entries to directories, where we require correct
+// collation of file names in order not to cause corruption of the file system.
+/**
+ * ntfs_lookup_inode_by_name - find an inode in a directory given its name
+ * @dir_ni:     ntfs inode of the directory in which to search for the name
+ * @uname:      Unicode name for which to search in the directory
+ * @uname_len:  length of the name @uname in Unicode characters
+ *
+ * Search the directory with inode @dir_ni for the name @uname.  The index root
+ * is searched first and, where the terminating entry of a node has a child
+ * node, the search descends into the index allocation blocks.  On success the
+ * mft reference stored in the matching index entry is returned converted to
+ * cpu format.
+ *
+ * On failure the negative error code is returned encoded with ERR_MREF(); if
+ * the name is not found the code is -ENOENT and inconsistencies in the index
+ * yield -EIO.  The return type is u64, so testing the raw return value for
+ * being negative does not detect errors.
+ *
+ * Note, @uname_len does not include the (optional) terminating NUL character.
+ */
+u64 ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname,
+                const int uname_len)
+{
+        ntfs_volume *vol = dir_ni->vol;
+        struct super_block *sb = vol->sb;
+        MFT_RECORD *m;
+        INDEX_ROOT *ir;
+        INDEX_ENTRY *ie;
+        INDEX_ALLOCATION *ia;
+        u8 *index_end;
+        u64 mref;
+        ntfs_attr_search_ctx *ctx;
+        int err, rc;
+        IGNORE_CASE_BOOL ic;
+        VCN vcn, old_vcn;
+        struct address_space *ia_mapping;
+        struct page *page;
+        u8 *kaddr;
+        /* Get hold of the mft record for the directory. */
+        m = map_mft_record(dir_ni);
+        if (IS_ERR(m)) {
+                ntfs_error(sb, "map_mft_record() failed with error code %ld.",
+                                -PTR_ERR(m));
+                return ERR_MREF(PTR_ERR(m));
+        }
+        ctx = ntfs_attr_get_search_ctx(dir_ni, m);
+        if (!ctx) {
+                err = -ENOMEM;
+                goto err_out;
+        }
+        /* Find the index root attribute in the mft record. */
+        err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL,
+                        0, ctx);
+        if (unlikely(err)) {
+                if (err == -ENOENT) {
+                        ntfs_error(sb, "Index root attribute missing in "
+                                        "directory inode 0x%lx.",
+                                        dir_ni->mft_no);
+                        err = -EIO;
+                }
+                goto err_out;
+        }
+        /* Get to the index root value (it's been verified in read_inode). */
+        ir = (INDEX_ROOT*)((u8*)ctx->attr +
+                        le16_to_cpu(ctx->attr->data.resident.value_offset));
+        index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length);
+        /* The first index entry. */
+        ie = (INDEX_ENTRY*)((u8*)&ir->index +
+                        le32_to_cpu(ir->index.entries_offset));
+        /*
+         * Loop until we exceed valid memory (corruption case) or until we
+         * reach the last entry.
+         */
+        for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
+                /* Bounds checks. */
+                if ((u8*)ie < (u8*)ctx->mrec || (u8*)ie +
+                                sizeof(INDEX_ENTRY_HEADER) > index_end ||
+                                (u8*)ie + le16_to_cpu(ie->key_length) >
+                                index_end)
+                        goto dir_err_out;
+                /*
+                 * The last entry cannot contain a name. It can however contain
+                 * a pointer to a child node in the B+tree so we just break out.
+                 */
+                if (ie->flags & INDEX_ENTRY_END)
+                        break;
+                /*
+                 * If the current entry has a name type of POSIX, the name is
+                 * case sensitive and not otherwise. This has the effect of us
+                 * not being able to access any POSIX file names which collate
+                 * after the non-POSIX one when they only differ in case, but
+                 * anyone doing screwy stuff like that deserves to burn in
+                 * hell... Doing that kind of stuff on NT4 actually causes
+                 * corruption on the partition even when using SP6a and Linux
+                 * is not involved at all.
+                 */
+                ic = ie->key.file_name.file_name_type ? IGNORE_CASE :
+                                CASE_SENSITIVE;
+                /*
+                 * If the names match perfectly, we are done and return the
+                 * mft reference of the inode (i.e. the inode number together
+                 * with the sequence number for consistency checking). We
+                 * convert it to cpu format before returning.
+                 */
+                if (ntfs_are_names_equal(uname, uname_len,
+                                (ntfschar*)&ie->key.file_name.file_name,
+                                ie->key.file_name.file_name_length, ic,
+                                vol->upcase, vol->upcase_len)) {
+found_it:
+                        mref = le64_to_cpu(ie->data.dir.indexed_file);
+                        ntfs_attr_put_search_ctx(ctx);
+                        unmap_mft_record(dir_ni);
+                        return mref;
+                }
+                /*
+                 * Not a perfect match, need to do full blown collation so we
+                 * know which way in the B+tree we have to go.
+                 */
+                rc = ntfs_collate_names(uname, uname_len,
+                                (ntfschar*)&ie->key.file_name.file_name,
+                                ie->key.file_name.file_name_length, 1,
+                                IGNORE_CASE, vol->upcase, vol->upcase_len);
+                /*
+                 * If uname collates before the name of the current entry, there
+                 * is definitely no such name in this index but we might need to
+                 * descend into the B+tree so we just break out of the loop.
+                 */
+                if (rc == -1)
+                        break;
+                /* The names are not equal, continue the search. */
+                if (rc)
+                        continue;
+                /*
+                 * Names match with case insensitive comparison, now try the
+                 * case sensitive comparison, which is required for proper
+                 * collation.
+                 */
+                rc = ntfs_collate_names(uname, uname_len,
+                                (ntfschar*)&ie->key.file_name.file_name,
+                                ie->key.file_name.file_name_length, 1,
+                                CASE_SENSITIVE, vol->upcase, vol->upcase_len);
+                if (rc == -1)
+                        break;
+                if (rc)
+                        continue;
+                /*
+                 * Perfect match, this will never happen as the
+                 * ntfs_are_names_equal() call will have gotten a match but we
+                 * still treat it correctly.
+                 */
+                goto found_it;
+        }
+        /*
+         * We have finished with this index without success. Check for the
+         * presence of a child node.
+         */
+        if (!(ie->flags & INDEX_ENTRY_NODE)) {
+                /* No child node, return -ENOENT. */
+                err = -ENOENT;
+                goto err_out;
+        } /* Child node present, descend into it. */
+        /* Consistency check: Verify that an index allocation exists. */
+        if (!NInoIndexAllocPresent(dir_ni)) {
+                ntfs_error(sb, "No index allocation attribute but index entry "
+                                "requires one. Directory inode 0x%lx is "
+                                "corrupt or driver bug.", dir_ni->mft_no);
+                goto err_out;
+        }
+        /* The vcn of the child node is the last 8 bytes of the index entry. */
+        vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8));
+        ia_mapping = VFS_I(dir_ni)->i_mapping;
+        /*
+         * We are done with the index root and the mft record. Release them,
+         * otherwise we deadlock with ntfs_map_page().
+         */
+        ntfs_attr_put_search_ctx(ctx);
+        unmap_mft_record(dir_ni);
+        m = NULL;
+        ctx = NULL;
+descend_into_child_node:
+        /*
+         * Convert vcn to index into the index allocation attribute in units
+         * of PAGE_CACHE_SIZE and map the page cache page, reading it from
+         * disk if necessary.
+         */
+        page = ntfs_map_page(ia_mapping, vcn <<
+                        dir_ni->itype.index.vcn_size_bits >> PAGE_CACHE_SHIFT);
+        if (IS_ERR(page)) {
+                ntfs_error(sb, "Failed to map directory index page, error %ld.",
+                                -PTR_ERR(page));
+                err = PTR_ERR(page);
+                goto err_out;
+        }
+        lock_page(page);
+        kaddr = (u8*)page_address(page);
+fast_descend_into_child_node:
+        /* Get to the index allocation block. */
+        ia = (INDEX_ALLOCATION*)(kaddr + ((vcn <<
+                        dir_ni->itype.index.vcn_size_bits) & ~PAGE_CACHE_MASK));
+        /* Bounds checks. */
+        if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_CACHE_SIZE) {
+                ntfs_error(sb, "Out of bounds check failed. Corrupt directory "
+                                "inode 0x%lx or driver bug.", dir_ni->mft_no);
+                goto unm_err_out;
+        }
+        /* Catch multi sector transfer fixup errors. */
+        if (unlikely(!ntfs_is_indx_record(ia->magic))) {
+                ntfs_error(sb, "Directory index record with vcn 0x%llx is "
+                                "corrupt.  Corrupt inode 0x%lx.  Run chkdsk.",
+                                (unsigned long long)vcn, dir_ni->mft_no);
+                goto unm_err_out;
+        }
+        if (sle64_to_cpu(ia->index_block_vcn) != vcn) {
+                ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is "
+                                "different from expected VCN (0x%llx). "
+                                "Directory inode 0x%lx is corrupt or driver "
+                                "bug.", (unsigned long long)
+                                sle64_to_cpu(ia->index_block_vcn),
+                                (unsigned long long)vcn, dir_ni->mft_no);
+                goto unm_err_out;
+        }
+        if (le32_to_cpu(ia->index.allocated_size) + 0x18 !=
+                        dir_ni->itype.index.block_size) {
+                ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
+                                "0x%lx has a size (%u) differing from the "
+                                "directory specified size (%u). Directory "
+                                "inode is corrupt or driver bug.",
+                                (unsigned long long)vcn, dir_ni->mft_no,
+                                le32_to_cpu(ia->index.allocated_size) + 0x18,
+                                dir_ni->itype.index.block_size);
+                goto unm_err_out;
+        }
+        index_end = (u8*)ia + dir_ni->itype.index.block_size;
+        if (index_end > kaddr + PAGE_CACHE_SIZE) {
+                ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
+                                "0x%lx crosses page boundary. Impossible! "
+                                "Cannot access! This is probably a bug in the "
+                                "driver.", (unsigned long long)vcn,
+                                dir_ni->mft_no);
+                goto unm_err_out;
+        }
+        index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length);
+        if (index_end > (u8*)ia + dir_ni->itype.index.block_size) {
+                ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of directory "
+                                "inode 0x%lx exceeds maximum size.",
+                                (unsigned long long)vcn, dir_ni->mft_no);
+                goto unm_err_out;
+        }
+        /* The first index entry. */
+        ie = (INDEX_ENTRY*)((u8*)&ia->index +
+                        le32_to_cpu(ia->index.entries_offset));
+        /*
+         * Iterate similar to above big loop but applied to index buffer, thus
+         * loop until we exceed valid memory (corruption case) or until we
+         * reach the last entry.
+         */
+        for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
+                /* Bounds check. */
+                if ((u8*)ie < (u8*)ia || (u8*)ie +
+                                sizeof(INDEX_ENTRY_HEADER) > index_end ||
+                                (u8*)ie + le16_to_cpu(ie->key_length) >
+                                index_end) {
+                        ntfs_error(sb, "Index entry out of bounds in "
+                                        "directory inode 0x%lx.",
+                                        dir_ni->mft_no);
+                        goto unm_err_out;
+                }
+                /*
+                 * The last entry cannot contain a name. It can however contain
+                 * a pointer to a child node in the B+tree so we just break out.
+                 */
+                if (ie->flags & INDEX_ENTRY_END)
+                        break;
+                /*
+                 * If the current entry has a name type of POSIX, the name is
+                 * case sensitive and not otherwise. This has the effect of us
+                 * not being able to access any POSIX file names which collate
+                 * after the non-POSIX one when they only differ in case, but
+                 * anyone doing screwy stuff like that deserves to burn in
+                 * hell... Doing that kind of stuff on NT4 actually causes
+                 * corruption on the partition even when using SP6a and Linux
+                 * is not involved at all.
+                 */
+                ic = ie->key.file_name.file_name_type ? IGNORE_CASE :
+                                CASE_SENSITIVE;
+                /*
+                 * If the names match perfectly, we are done and return the
+                 * mft reference of the inode (i.e. the inode number together
+                 * with the sequence number for consistency checking). We
+                 * convert it to cpu format before returning.
+                 */
+                if (ntfs_are_names_equal(uname, uname_len,
+                                (ntfschar*)&ie->key.file_name.file_name,
+                                ie->key.file_name.file_name_length, ic,
+                                vol->upcase, vol->upcase_len)) {
+found_it2:
+                        mref = le64_to_cpu(ie->data.dir.indexed_file);
+                        unlock_page(page);
+                        ntfs_unmap_page(page);
+                        return mref;
+                }
+                /*
+                 * Not a perfect match, need to do full blown collation so we
+                 * know which way in the B+tree we have to go.
+                 */
+                rc = ntfs_collate_names(uname, uname_len,
+                                (ntfschar*)&ie->key.file_name.file_name,
+                                ie->key.file_name.file_name_length, 1,
+                                IGNORE_CASE, vol->upcase, vol->upcase_len);
+                /*
+                 * If uname collates before the name of the current entry, there
+                 * is definitely no such name in this index but we might need to
+                 * descend into the B+tree so we just break out of the loop.
+                 */
+                if (rc == -1)
+                        break;
+                /* The names are not equal, continue the search. */
+                if (rc)
+                        continue;
+                /*
+                 * Names match with case insensitive comparison, now try the
+                 * case sensitive comparison, which is required for proper
+                 * collation.
+                 */
+                rc = ntfs_collate_names(uname, uname_len,
+                                (ntfschar*)&ie->key.file_name.file_name,
+                                ie->key.file_name.file_name_length, 1,
+                                CASE_SENSITIVE, vol->upcase, vol->upcase_len);
+                if (rc == -1)
+                        break;
+                if (rc)
+                        continue;
+                /*
+                 * Perfect match, this will never happen as the
+                 * ntfs_are_names_equal() call will have gotten a match but we
+                 * still treat it correctly.
+                 */
+                goto found_it2;
+        }
+        /*
+         * We have finished with this index buffer without success. Check for
+         * the presence of a child node.
+         */
+        if (ie->flags & INDEX_ENTRY_NODE) {
+                if ((ia->index.flags & NODE_MASK) == LEAF_NODE) {
+                        ntfs_error(sb, "Index entry with child node found in "
+                                        "a leaf node in directory inode 0x%lx.",
+                                        dir_ni->mft_no);
+                        goto unm_err_out;
+                }
+                /* Child node present, descend into it. */
+                old_vcn = vcn;
+                vcn = sle64_to_cpup((sle64*)((u8*)ie +
+                                le16_to_cpu(ie->length) - 8));
+                if (vcn >= 0) {
+                        /* If vcn is in the same page cache page as old_vcn we
+                         * recycle the mapped page. */
+                        if (old_vcn << vol->cluster_size_bits >>
+                                        PAGE_CACHE_SHIFT == vcn <<
+                                        vol->cluster_size_bits >>
+                                        PAGE_CACHE_SHIFT)
+                                goto fast_descend_into_child_node;
+                        unlock_page(page);
+                        ntfs_unmap_page(page);
+                        goto descend_into_child_node;
+                }
+                ntfs_error(sb, "Negative child node vcn in directory inode "
+                                "0x%lx.", dir_ni->mft_no);
+                goto unm_err_out;
+        }
+        /* No child node, return -ENOENT. */
+        ntfs_debug("Entry not found.");
+        err = -ENOENT;
+unm_err_out:
+        unlock_page(page);
+        ntfs_unmap_page(page);
+err_out:
+        if (!err)
+                err = -EIO;
+        if (ctx)
+                ntfs_attr_put_search_ctx(ctx);
+        if (m)
+                unmap_mft_record(dir_ni);
+        return ERR_MREF(err);
+dir_err_out:
+        ntfs_error(sb, "Corrupt directory. Aborting lookup.");
+        goto err_out;
+}
+#endif
+/**
+ * ntfs_filldir - hand one directory index entry to the vfs filldir callback
+ * @vol:        current ntfs volume
+ * @fpos:       position in the directory
+ * @ndir:       ntfs inode of current directory
+ * @ia_page:    locked page holding the index block of @ie, or NULL
+ * @ie:         current index entry
+ * @name:       buffer receiving the converted name
+ * @dirent:     vfs filldir callback context
+ * @filldir:    vfs filldir callback
+ *
+ * Convert the Unicode file name stored in @ie to the loaded NLS, placing the
+ * result in @name, and pass it to the @filldir callback.  DOS name space
+ * entries, the root directory self reference, mft references below
+ * FILE_first_user (unless NVolShowSystemFiles() is true for @vol) and names
+ * that fail to convert are skipped.
+ *
+ * The lock on @ia_page, if given, is dropped across the @filldir() call,
+ * otherwise we would deadlock with NFSd when it calls ->lookup since
+ * ntfs_lookup() will lock the same page.  The lock is only retaken when
+ * @filldir() returned zero; on a non-zero return @ia_page is left unlocked as
+ * ntfs_readdir() would need to drop the lock immediately anyway.
+ *
+ * Return zero for a skipped entry, otherwise the value @filldir() returned.
+ */
+static inline int ntfs_filldir(ntfs_volume *vol, loff_t fpos,
+                ntfs_inode *ndir, struct page *ia_page, INDEX_ENTRY *ie,
+                u8 *name, void *dirent, filldir_t filldir)
+{
+        FILE_NAME_TYPE_FLAGS ns = ie->key.file_name.file_name_type;
+        unsigned long ino;
+        unsigned d_type;
+        int len, ret;
+        if (ns == FILE_NAME_DOS) {
+                ntfs_debug("Skipping DOS name space entry.");
+                return 0;
+        }
+        if (MREF_LE(ie->data.dir.indexed_file) == FILE_root) {
+                ntfs_debug("Skipping root directory self reference entry.");
+                return 0;
+        }
+        if (!NVolShowSystemFiles(vol) &&
+                        MREF_LE(ie->data.dir.indexed_file) < FILE_first_user) {
+                ntfs_debug("Skipping system file.");
+                return 0;
+        }
+        len = ntfs_ucstonls(vol, (ntfschar*)&ie->key.file_name.file_name,
+                        ie->key.file_name.file_name_length, &name,
+                        NTFS_MAX_NAME_LEN * NLS_MAX_CHARSET_SIZE + 1);
+        if (len <= 0) {
+                ntfs_debug("Skipping unrepresentable file.");
+                return 0;
+        }
+        /* Everything needed from @ie is read before the page is unlocked. */
+        d_type = DT_REG;
+        if (ie->key.file_name.file_attributes &
+                        FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT)
+                d_type = DT_DIR;
+        ino = MREF_LE(ie->data.dir.indexed_file);
+        /* Drop the page lock across the callback to avoid the deadlock. */
+        if (ia_page)
+                unlock_page(ia_page);
+        ntfs_debug("Calling filldir for %s with len %i, fpos 0x%llx, inode "
+                        "0x%lx, DT_%s.", name, len, fpos, ino,
+                        d_type == DT_DIR ? "DIR" : "REG");
+        ret = filldir(dirent, name, len, fpos, ino, d_type);
+        /* When aborting ->readdir the page is deliberately left unlocked. */
+        if (ret)
+                return ret;
+        if (ia_page)
+                lock_page(ia_page);
+        return 0;
+}
+/*
+ * We use the same basic approach as the old NTFS driver, i.e. we parse the
+ * index root entries and then the index allocation entries that are marked
+ * as in use in the index bitmap.
+ *
+ * While this will return the names in random order this doesn't matter for
+ * ->readdir but OTOH results in a faster ->readdir.
+ *
+ * VFS calls ->readdir without BKL but with i_sem held. This protects the VFS
+ * parts (e.g. ->f_pos and ->i_size), and it also protects against directory
+ * modifications.
+ *
+ * Locking:  - Caller must hold i_sem on the directory.
+ *           - Each page cache page in the index allocation mapping must be
+ *             locked whilst being accessed otherwise we may find a corrupt
+ *             page due to it being under ->writepage at the moment which
+ *             applies the mst protection fixups before writing out and then
+ *             removes them again after the write is complete after which it 
+ *             unlocks the page.
+ */
+static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+        s64 ia_pos, ia_start, prev_ia_pos, bmp_pos;
+        loff_t fpos;
+        struct inode *bmp_vi, *vdir = filp->f_dentry->d_inode;
+        struct super_block *sb = vdir->i_sb;
+        ntfs_inode *ndir = NTFS_I(vdir);
+        ntfs_volume *vol = NTFS_SB(sb);
+        MFT_RECORD *m;
+        INDEX_ROOT *ir = NULL;
+        INDEX_ENTRY *ie;
+        INDEX_ALLOCATION *ia;
+        u8 *name = NULL;
+        int rc, err, ir_pos, cur_bmp_pos;
+        struct address_space *ia_mapping, *bmp_mapping;
+        struct page *bmp_page = NULL, *ia_page = NULL;
+        u8 *kaddr, *bmp, *index_end;
+        ntfs_attr_search_ctx *ctx;
+        fpos = filp->f_pos;
+        ntfs_debug("Entering for inode 0x%lx, fpos 0x%llx.",
+                        vdir->i_ino, fpos);
+        rc = err = 0;
+        /* Are we at end of dir yet? */
+        if (fpos >= vdir->i_size + vol->mft_record_size)
+                goto done;
+        /* Emulate . and .. for all directories. */
+        if (!fpos) {
+                ntfs_debug("Calling filldir for . with len 1, fpos 0x0, "
+                                "inode 0x%lx, DT_DIR.", vdir->i_ino);
+                rc = filldir(dirent, ".", 1, fpos, vdir->i_ino, DT_DIR);
+                if (rc)
+                        goto done;
+                fpos++;
+        }
+        if (fpos == 1) {
+                ntfs_debug("Calling filldir for .. with len 2, fpos 0x1, "
+                                "inode 0x%lx, DT_DIR.",
+                                parent_ino(filp->f_dentry));
+                rc = filldir(dirent, "..", 2, fpos,
+                                parent_ino(filp->f_dentry), DT_DIR);
+                if (rc)
+                        goto done;
+                fpos++;
+        }
+        m = NULL;
+        ctx = NULL;
+        /*
+         * Allocate a buffer to store the current name being processed
+         * converted to format determined by current NLS.
+         */
+        name = (u8*)kmalloc(NTFS_MAX_NAME_LEN * NLS_MAX_CHARSET_SIZE + 1,
+                        GFP_NOFS);
+        if (unlikely(!name)) {
+                err = -ENOMEM;
+                goto err_out;
+        }
+        /* Are we jumping straight into the index allocation attribute? */
+        if (fpos >= vol->mft_record_size)
+                goto skip_index_root;
+        /* Get hold of the mft record for the directory. */
+        m = map_mft_record(ndir);
+        if (IS_ERR(m)) {
+                err = PTR_ERR(m);
+                m = NULL;
+                goto err_out;
+        }
+        ctx = ntfs_attr_get_search_ctx(ndir, m);
+        if (unlikely(!ctx)) {
+                err = -ENOMEM;
+                goto err_out;
+        }
+        /* Get the offset into the index root attribute. */
+        ir_pos = (s64)fpos;
+        /* Find the index root attribute in the mft record. */
+        err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL,
+                        0, ctx);
+        if (unlikely(err)) {
+                ntfs_error(sb, "Index root attribute missing in directory "
+                                "inode 0x%lx.", vdir->i_ino);
+                goto err_out;
+        }
+        /*
+         * Copy the index root attribute value to a buffer so that we can put
+         * the search context and unmap the mft record before calling the
+         * filldir() callback.  We need to do this because of NFSd which calls
+         * ->lookup() from its filldir callback() and this causes NTFS to
+         * deadlock as ntfs_lookup() maps the mft record of the directory and
+         * we have got it mapped here already.  The only solution is for us to
+         * unmap the mft record here so that a call to ntfs_lookup() is able to
+         * map the mft record without deadlocking.
+         */
+        rc = le32_to_cpu(ctx->attr->data.resident.value_length);
+        ir = (INDEX_ROOT*)kmalloc(rc, GFP_NOFS);
+        if (unlikely(!ir)) {
+                err = -ENOMEM;
+                goto err_out;
+        }
+        /* Copy the index root value (it has been verified in read_inode). */
+        memcpy(ir, (u8*)ctx->attr +
+                        le16_to_cpu(ctx->attr->data.resident.value_offset), rc);
+        ntfs_attr_put_search_ctx(ctx);
+        unmap_mft_record(ndir);
+        ctx = NULL;
+        m = NULL;
+        index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length);
+        /* The first index entry. */
+        ie = (INDEX_ENTRY*)((u8*)&ir->index +
+                        le32_to_cpu(ir->index.entries_offset));
+        /*
+         * Loop until we exceed valid memory (corruption case) or until we
+         * reach the last entry or until filldir tells us it has had enough
+         * or signals an error (both covered by the rc test).
+         */
+        for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
+                ntfs_debug("In index root, offset 0x%zx.", (u8*)ie - (u8*)ir);
+                /* Bounds checks. */
+                if (unlikely((u8*)ie < (u8*)ir || (u8*)ie +
+                                sizeof(INDEX_ENTRY_HEADER) > index_end ||
+                                (u8*)ie + le16_to_cpu(ie->key_length) >
+                                index_end))
+                        goto err_out;
+                /* The last entry cannot contain a name. */
+                if (ie->flags & INDEX_ENTRY_END)
+                        break;
+                /* Skip index root entry if continuing previous readdir. */
+                if (ir_pos > (u8*)ie - (u8*)ir)
+                        continue;
+                /* Advance the position even if going to skip the entry. */
+                fpos = (u8*)ie - (u8*)ir;
+                /* Submit the name to the filldir callback. */
+                rc = ntfs_filldir(vol, fpos, ndir, NULL, ie, name, dirent,
+                                filldir);
+                if (rc) {
+                        kfree(ir);
+                        goto abort;
+                }
+        }
+        /* We are done with the index root and can free the buffer. */
+        kfree(ir);
+        ir = NULL;
+        /* If there is no index allocation attribute we are finished. */
+        if (!NInoIndexAllocPresent(ndir))
+                goto EOD;
+        /* Advance fpos to the beginning of the index allocation. */
+        fpos = vol->mft_record_size;
+skip_index_root:
+        kaddr = NULL;
+        prev_ia_pos = -1LL;
+        /* Get the offset into the index allocation attribute. */
+        ia_pos = (s64)fpos - vol->mft_record_size;
+        ia_mapping = vdir->i_mapping;
+        bmp_vi = ndir->itype.index.bmp_ino;
+        if (unlikely(!bmp_vi)) {
+                ntfs_debug("Inode 0x%lx, regetting index bitmap.", vdir->i_ino);
+                bmp_vi = ntfs_attr_iget(vdir, AT_BITMAP, I30, 4);
+                if (IS_ERR(bmp_vi)) {
+                        ntfs_error(sb, "Failed to get bitmap attribute.");
+                        err = PTR_ERR(bmp_vi);
+                        goto err_out;
+                }
+                ndir->itype.index.bmp_ino = bmp_vi;
+        }
+        bmp_mapping = bmp_vi->i_mapping;
+        /* Get the starting bitmap bit position and sanity check it. */
+        bmp_pos = ia_pos >> ndir->itype.index.block_size_bits;
+        if (unlikely(bmp_pos >> 3 >= bmp_vi->i_size)) {
+                ntfs_error(sb, "Current index allocation position exceeds "
+                                "index bitmap size.");
+                goto err_out;
+        }
+        /* Get the starting bit position in the current bitmap page. */
+        cur_bmp_pos = bmp_pos & ((PAGE_CACHE_SIZE * 8) - 1);
+        bmp_pos &= ~(u64)((PAGE_CACHE_SIZE * 8) - 1);
+get_next_bmp_page:
+        ntfs_debug("Reading bitmap with page index 0x%llx, bit ofs 0x%llx",
+                        (unsigned long long)bmp_pos >> (3 + PAGE_CACHE_SHIFT),
+                        (unsigned long long)bmp_pos &
+                        (unsigned long long)((PAGE_CACHE_SIZE * 8) - 1));
+        bmp_page = ntfs_map_page(bmp_mapping,
+                        bmp_pos >> (3 + PAGE_CACHE_SHIFT));
+        if (IS_ERR(bmp_page)) {
+                ntfs_error(sb, "Reading index bitmap failed.");
+                err = PTR_ERR(bmp_page);
+                bmp_page = NULL;
+                goto err_out;
+        }
+        bmp = (u8*)page_address(bmp_page);
+        /* Find next index block in use. */
+        while (!(bmp[cur_bmp_pos >> 3] & (1 << (cur_bmp_pos & 7)))) {
+find_next_index_buffer:
+                cur_bmp_pos++;
+                /*
+                 * If we have reached the end of the bitmap page, get the next
+                 * page, and put away the old one.
+                 */
+                if (unlikely((cur_bmp_pos >> 3) >= PAGE_CACHE_SIZE)) {
+                        ntfs_unmap_page(bmp_page);
+                        bmp_pos += PAGE_CACHE_SIZE * 8;
+                        cur_bmp_pos = 0;
+                        goto get_next_bmp_page;
+                }
+                /* If we have reached the end of the bitmap, we are done. */
+                if (unlikely(((bmp_pos + cur_bmp_pos) >> 3) >= vdir->i_size))
+                        goto unm_EOD;
+                ia_pos = (bmp_pos + cur_bmp_pos) <<
+                                ndir->itype.index.block_size_bits;
+        }
+        ntfs_debug("Handling index buffer 0x%llx.",
+                        (unsigned long long)bmp_pos + cur_bmp_pos);
+        /* If the current index buffer is in the same page we reuse the page. */
+        if ((prev_ia_pos & PAGE_CACHE_MASK) != (ia_pos & PAGE_CACHE_MASK)) {
+                prev_ia_pos = ia_pos;
+                if (likely(ia_page != NULL)) {
+                        unlock_page(ia_page);
+                        ntfs_unmap_page(ia_page);
+                }
+                /*
+                 * Map the page cache page containing the current ia_pos,
+                 * reading it from disk if necessary.
+                 */
+                ia_page = ntfs_map_page(ia_mapping, ia_pos >> PAGE_CACHE_SHIFT);
+                if (IS_ERR(ia_page)) {
+                        ntfs_error(sb, "Reading index allocation data failed.");
+                        err = PTR_ERR(ia_page);
+                        ia_page = NULL;
+                        goto err_out;
+                }
+                lock_page(ia_page);
+                kaddr = (u8*)page_address(ia_page);
+        }
+        /* Get the current index buffer. */
+        ia = (INDEX_ALLOCATION*)(kaddr + (ia_pos & ~PAGE_CACHE_MASK &
+                        ~(s64)(ndir->itype.index.block_size - 1)));
+        /* Bounds checks. */
+        if (unlikely((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_CACHE_SIZE)) {
+                ntfs_error(sb, "Out of bounds check failed. Corrupt directory "
+                                "inode 0x%lx or driver bug.", vdir->i_ino);
+                goto err_out;
+        }
+        /* Catch multi sector transfer fixup errors. */
+        if (unlikely(!ntfs_is_indx_record(ia->magic))) {
+                ntfs_error(sb, "Directory index record with vcn 0x%llx is "
+                                "corrupt.  Corrupt inode 0x%lx.  Run chkdsk.",
+                                (unsigned long long)ia_pos >>
+                                ndir->itype.index.vcn_size_bits, vdir->i_ino);
+                goto err_out;
+        }
+        if (unlikely(sle64_to_cpu(ia->index_block_vcn) != (ia_pos &
+                        ~(s64)(ndir->itype.index.block_size - 1)) >>
+                        ndir->itype.index.vcn_size_bits)) {
+                ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is "
+                                "different from expected VCN (0x%llx). "
+                                "Directory inode 0x%lx is corrupt or driver "
+                                "bug. ", (unsigned long long)
+                                sle64_to_cpu(ia->index_block_vcn),
+                                (unsigned long long)ia_pos >>
+                                ndir->itype.index.vcn_size_bits, vdir->i_ino);
+                goto err_out;
+        }
+        if (unlikely(le32_to_cpu(ia->index.allocated_size) + 0x18 !=
+                        ndir->itype.index.block_size)) {
+                ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
+                                "0x%lx has a size (%u) differing from the "
+                                "directory specified size (%u). Directory "
+                                "inode is corrupt or driver bug.",
+                                (unsigned long long)ia_pos >>
+                                ndir->itype.index.vcn_size_bits, vdir->i_ino,
+                                le32_to_cpu(ia->index.allocated_size) + 0x18,
+                                ndir->itype.index.block_size);
+                goto err_out;
+        }
+        index_end = (u8*)ia + ndir->itype.index.block_size;
+        if (unlikely(index_end > kaddr + PAGE_CACHE_SIZE)) {
+                ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
+                                "0x%lx crosses page boundary. Impossible! "
+                                "Cannot access! This is probably a bug in the "
+                                "driver.", (unsigned long long)ia_pos >>
+                                ndir->itype.index.vcn_size_bits, vdir->i_ino);
+                goto err_out;
+        }
+        ia_start = ia_pos & ~(s64)(ndir->itype.index.block_size - 1);
+        index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length);
+        if (unlikely(index_end > (u8*)ia + ndir->itype.index.block_size)) {
+                ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of directory "
+                                "inode 0x%lx exceeds maximum size.",
+                                (unsigned long long)ia_pos >>
+                                ndir->itype.index.vcn_size_bits, vdir->i_ino);
+                goto err_out;
+        }
+        /* The first index entry in this index buffer. */
+        ie = (INDEX_ENTRY*)((u8*)&ia->index +
+                        le32_to_cpu(ia->index.entries_offset));
+        /*
+         * Loop until we exceed valid memory (corruption case) or until we
+         * reach the last entry or until filldir tells us it has had enough
+         * or signals an error (both covered by the rc test).
+         */
+        for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
+                ntfs_debug("In index allocation, offset 0x%llx.",
+                                (unsigned long long)ia_start +
+                                (unsigned long long)((u8*)ie - (u8*)ia));
+                /* Bounds checks. */
+                if (unlikely((u8*)ie < (u8*)ia || (u8*)ie +
+                                sizeof(INDEX_ENTRY_HEADER) > index_end ||
+                                (u8*)ie + le16_to_cpu(ie->key_length) >
+                                index_end))
+                        goto err_out;
+                /* The last entry cannot contain a name. */
+                if (ie->flags & INDEX_ENTRY_END)
+                        break;
+                /* Skip index block entry if continuing previous readdir. */
+                if (ia_pos - ia_start > (u8*)ie - (u8*)ia)
+                        continue;
+                /* Advance the position even if going to skip the entry. */
+                fpos = (u8*)ie - (u8*)ia +
+                                (sle64_to_cpu(ia->index_block_vcn) <<
+                                ndir->itype.index.vcn_size_bits) +
+                                vol->mft_record_size;
+                /*
+                 * Submit the name to the @filldir callback.  Note,
+                 * ntfs_filldir() drops the lock on @ia_page but it retakes it
+                 * before returning, unless a non-zero value is returned in
+                 * which case the page is left unlocked.
+                 */
+                rc = ntfs_filldir(vol, fpos, ndir, ia_page, ie, name, dirent,
+                                filldir);
+                if (rc) {
+                        /* @ia_page is already unlocked in this case. */
+                        ntfs_unmap_page(ia_page);
+                        ntfs_unmap_page(bmp_page);
+                        goto abort;
+                }
+        }
+        goto find_next_index_buffer;
+unm_EOD:
+        if (ia_page) {
+                unlock_page(ia_page);
+                ntfs_unmap_page(ia_page);
+        }
+        ntfs_unmap_page(bmp_page);
+EOD:
+        /* We are finished, set fpos to EOD. */
+        fpos = vdir->i_size + vol->mft_record_size;
+abort:
+        kfree(name);
+done:
+#ifdef DEBUG
+        if (!rc)
+                ntfs_debug("EOD, fpos 0x%llx, returning 0.", fpos);
+        else
+                ntfs_debug("filldir returned %i, fpos 0x%llx, returning 0.",
+                                rc, fpos);
+#endif
+        filp->f_pos = fpos;
+        return 0;
+err_out:
+        if (bmp_page)
+                ntfs_unmap_page(bmp_page);
+        if (ia_page) {
+                unlock_page(ia_page);
+                ntfs_unmap_page(ia_page);
+        }
+        if (ir)
+                kfree(ir);
+        if (name)
+                kfree(name);
+        if (ctx)
+                ntfs_attr_put_search_ctx(ctx);
+        if (m)
+                unmap_mft_record(ndir);
+        if (!err)
+                err = -EIO;
+        ntfs_debug("Failed. Returning error code %i.", -err);
+        filp->f_pos = fpos;
+        return err;
+}
+/**
+ * ntfs_dir_open - called when an inode is about to be opened
+ * @vi:         inode to be opened
+ * @filp:       file structure describing the inode
+ *
+ * Refuse to open directories larger than MAX_LFS_FILESIZE when unsigned long
+ * is narrower than 64 bits, as the page cache page index would overflow.
+ * Rejecting the open up front keeps existing oversized directories from
+ * causing trouble later.  Exposing just the reachable part would be nicer but
+ * the limit is very unlikely to be hit on a 32-bit architecture, so the extra
+ * complexity is not worth it.
+ *
+ * On 64-bit architectures the size test is a compile time constant and the
+ * check is hopefully optimized away by the compiler.  Return 0 on success and
+ * -EFBIG if the directory is too large to be opened.
+ */
+static int ntfs_dir_open(struct inode *vi, struct file *filp)
+{
+        /* Size limit only applies where unsigned long is below 64 bits. */
+        const int narrow_index = sizeof(unsigned long) < 8;
+        if (narrow_index && vi->i_size > MAX_LFS_FILESIZE)
+                return -EFBIG;
+        return 0;
+}
+#ifdef NTFS_RW
+/**
+ * ntfs_dir_fsync - sync a directory to disk
+ * @filp:       directory to be synced
+ * @dentry:     dentry describing the directory to sync
+ * @datasync:   if non-zero only flush user data and not metadata
+ *
+ * Data integrity sync of a directory to disk, as used by the fsync, fdatasync,
+ * and msync system calls.  Modelled on file.c::ntfs_file_fsync().
+ *
+ * First write the $BITMAP attribute inode if it is attached, then the mft
+ * record and all associated extent mft records as well as the
+ * $INDEX_ALLOCATION attribute, and finally sync the block device.
+ *
+ * If @datasync is true, we do not wait on the inode(s) to be written out
+ * but we always wait on the page cache pages to be written out.
+ *
+ * @filp is ignored: in the past it could be NULL and we do not need it.
+ * Return 0 on success and the error code otherwise, with an
+ * ntfs_write_inode() error taking precedence over a sync_blockdev() error.
+ *
+ * Locking: Caller must hold i_sem on the inode.
+ *
+ * TODO: Other attribute/index inodes associated with this inode should
+ * probably be written as well but we have no simple way of getting to them.
+ */
+static int ntfs_dir_fsync(struct file *filp, struct dentry *dentry,
+                int datasync)
+{
+        struct inode *vi = dentry->d_inode;
+        ntfs_inode *dir_ni = NTFS_I(vi);
+        int bdev_err, ret;
+        ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
+        BUG_ON(!S_ISDIR(vi->i_mode));
+        if (NInoIndexAllocPresent(dir_ni) && dir_ni->itype.index.bmp_ino)
+                write_inode_now(dir_ni->itype.index.bmp_ino, !datasync);
+        ret = ntfs_write_inode(vi, 1);
+        write_inode_now(vi, !datasync);
+        bdev_err = sync_blockdev(vi->i_sb->s_bdev);
+        if (unlikely(!ret && bdev_err))
+                ret = bdev_err;
+        if (likely(!ret))
+                ntfs_debug("Done.");
+        else
+                ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx.  Error "
+                                "%u.", datasync ? "data" : "", vi->i_ino, -ret);
+        return ret;
+}
+#endif /* NTFS_RW */
+struct file_operations ntfs_dir_ops = { /* Directory file methods. */
+        .llseek         = generic_file_llseek,  /* Seek inside directory. */
+        .read           = generic_read_dir,     /* Reading returns -EISDIR. */
+        .readdir        = ntfs_readdir,         /* Read directory contents. */
+#ifdef NTFS_RW
+        .fsync          = ntfs_dir_fsync,       /* Sync a directory to disk. */
+        /*.aio_fsync    = ,*/                   /* Unset: sync all outstanding
+                                                   async i/o on a kiocb. */
+#endif /* NTFS_RW */
+        /*.ioctl        = ,*/                   /* Unset: perform function on
+                                                   the mounted filesystem. */
+        .open           = ntfs_dir_open,        /* Check size limit on open. */
+};
diff --git a/fs/ntfs/dir.h b/fs/ntfs/dir.h
new file mode 100644
index 000000000000..aea7582d561f
--- /dev/null
+++ b/fs/ntfs/dir.h
@@ -0,0 +1,48 @@
+/*
+ * dir.h - Defines for directory handling in NTFS Linux kernel driver. Part of
+ *         the Linux-NTFS project.
+ *
+ * Copyright (c) 2002-2004 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#ifndef _LINUX_NTFS_DIR_H
+#define _LINUX_NTFS_DIR_H
+#include "layout.h"
+#include "inode.h"
+#include "types.h"
+/*
+ * ntfs_name is used to return the file name to the caller of
+ * ntfs_lookup_inode_by_name() in order for the caller (namei.c::ntfs_lookup())
+ * to be able to deal with dcache aliasing issues.
+ */
+typedef struct {
+        MFT_REF mref;                   /* Presumably mft ref of the match. */
+        FILE_NAME_TYPE_FLAGS type;      /* Presumably name type of @name. */
+        u8 len;                         /* Length of @name (units: confirm). */
+        ntfschar name[0];               /* Zero length array: name follows. */
+} __attribute__ ((__packed__)) ntfs_name;
+/* The little endian Unicode string $I30 as a global constant. */
+extern ntfschar I30[5];
+extern MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni,
+                const ntfschar *uname, const int uname_len, ntfs_name **res);
+#endif /* _LINUX_NTFS_DIR_H */
diff --git a/fs/ntfs/endian.h b/fs/ntfs/endian.h
new file mode 100644
index 000000000000..927b5bf04b4f
--- /dev/null
+++ b/fs/ntfs/endian.h
@@ -0,0 +1,93 @@
+/*
+ * endian.h - Defines for endianness handling in NTFS Linux kernel driver.
+ *            Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2004 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#ifndef _LINUX_NTFS_ENDIAN_H
+#define _LINUX_NTFS_ENDIAN_H
+#include <asm/byteorder.h>
+#include "types.h"
+/*
+ * Signed little endian <-> cpu conversions built on the unsigned helpers.
+ */
+static inline s16 sle16_to_cpu(sle16 x)
+{
+        return (s16)le16_to_cpu((__force le16)x);
+}
+static inline s32 sle32_to_cpu(sle32 x)
+{
+        return (s32)le32_to_cpu((__force le32)x);
+}
+static inline s64 sle64_to_cpu(sle64 x)
+{
+        return (s64)le64_to_cpu((__force le64)x);
+}
+static inline s16 sle16_to_cpup(sle16 *x)
+{
+        return sle16_to_cpu(*x);
+}
+static inline s32 sle32_to_cpup(sle32 *x)
+{
+        return sle32_to_cpu(*x);
+}
+static inline s64 sle64_to_cpup(sle64 *x)
+{
+        return sle64_to_cpu(*x);
+}
+static inline sle16 cpu_to_sle16(s16 x)
+{
+        return (__force sle16)cpu_to_le16((u16)x);
+}
+static inline sle32 cpu_to_sle32(s32 x)
+{
+        return (__force sle32)cpu_to_le32((u32)x);
+}
+static inline sle64 cpu_to_sle64(s64 x)
+{
+        return (__force sle64)cpu_to_le64((u64)x);
+}
+static inline sle16 cpu_to_sle16p(s16 *x)
+{
+        return cpu_to_sle16(*x);
+}
+static inline sle32 cpu_to_sle32p(s32 *x)
+{
+        return cpu_to_sle32(*x);
+}
+static inline sle64 cpu_to_sle64p(s64 *x)
+{
+        return cpu_to_sle64(*x);
+}
+#endif /* _LINUX_NTFS_ENDIAN_H */
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
new file mode 100644
index 000000000000..db8713ea0d27
--- /dev/null
+++ b/fs/ntfs/file.c
@@ -0,0 +1,155 @@
+/*
+ * file.c - NTFS kernel file operations. Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2004 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#include <linux/pagemap.h>
+#include <linux/buffer_head.h>
+#include "inode.h"
+#include "debug.h"
+#include "ntfs.h"
+/**
+ * ntfs_file_open - called when an inode is about to be opened
+ * @vi:         inode to be opened
+ * @filp:       file structure describing the inode
+ *
+ * Refuse to open files larger than MAX_LFS_FILESIZE when unsigned long is
+ * narrower than 64 bits, as the page cache page index would overflow.
+ * Rejecting the open up front keeps existing oversized files from causing
+ * trouble later.  Exposing just the beginning of the file would be nicer but
+ * the limit is very unlikely to be hit on a 32-bit architecture, so the extra
+ * complexity is not worth it.
+ *
+ * On 64-bit architectures the size test is a compile time constant and the
+ * check is hopefully optimized away by the compiler.
+ *
+ * Return -EFBIG if the file is too large, otherwise hand over to
+ * generic_file_open() and return its result.
+ */
+static int ntfs_file_open(struct inode *vi, struct file *filp)
+{
+        /* Size limit only applies where unsigned long is below 64 bits. */
+        const int narrow_index = sizeof(unsigned long) < 8;
+        if (narrow_index && vi->i_size > MAX_LFS_FILESIZE)
+                return -EFBIG;
+        return generic_file_open(vi, filp);
+}
+#ifdef NTFS_RW
+/**
+ * ntfs_file_fsync - sync a file to disk
+ * @filp:       file to be synced
+ * @dentry:     dentry describing the file to sync
+ * @datasync:   if non-zero only flush user data and not metadata
+ *
+ * Data integrity sync of a file to disk, as used by the fsync, fdatasync, and
+ * msync system calls.  Inspired by fs/buffer.c::file_fsync().
+ *
+ * The mft record and all associated extent mft records are written with
+ * ntfs_write_inode() unless @datasync is true and the attribute is
+ * non-resident, in which case that step is skipped (the records might still
+ * get written due to the write_inode_now() call).  The $DATA attribute is
+ * then written and the block device synced.
+ *
+ * If @datasync is true, we also do not wait on the inode to be written out
+ * but we always wait on the page cache pages to be written out.
+ *
+ * @filp is ignored: in the past it could be NULL and we do not need it.
+ * Return 0 on success and the error code otherwise, with an
+ * ntfs_write_inode() error taking precedence over a sync_blockdev() error.
+ *
+ * Locking: Caller must hold i_sem on the inode.
+ *
+ * TODO: Other attribute/index inodes associated with this inode should
+ * probably be written as well but we have no simple way of getting to them,
+ * so we ignore this problem for now.
+ */
+static int ntfs_file_fsync(struct file *filp, struct dentry *dentry,
+                int datasync)
+{
+        struct inode *vi = dentry->d_inode;
+        int bdev_err, ret = 0;
+        ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
+        BUG_ON(S_ISDIR(vi->i_mode));
+        if (!(datasync && NInoNonResident(NTFS_I(vi))))
+                ret = ntfs_write_inode(vi, 1);
+        write_inode_now(vi, !datasync);
+        bdev_err = sync_blockdev(vi->i_sb->s_bdev);
+        if (unlikely(!ret && bdev_err))
+                ret = bdev_err;
+        if (likely(!ret))
+                ntfs_debug("Done.");
+        else
+                ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx.  Error "
+                                "%u.", datasync ? "data" : "", vi->i_ino, -ret);
+        return ret;
+}
+#endif /* NTFS_RW */
+struct file_operations ntfs_file_ops = { /* Mostly generic_file_*() helpers. */
+        .llseek         = generic_file_llseek,    /* Seek inside file. */
+        .read           = generic_file_read,      /* Read from file. */
+        .aio_read       = generic_file_aio_read,  /* Async read from file. */
+        .readv          = generic_file_readv,     /* Vectored read. */
+#ifdef NTFS_RW
+        .write          = generic_file_write,     /* Write to file. */
+        .aio_write      = generic_file_aio_write, /* Async write to file. */
+        .writev         = generic_file_writev,    /* Vectored write. */
+        /*.release      = ,*/                     /* Last file is closed.  See
+                                                     fs/ext2/file.c::
+                                                     ext2_release_file() for
+                                                     how to use this to discard
+                                                     preallocated space for
+                                                     write opened files. */
+        .fsync          = ntfs_file_fsync,        /* Sync a file to disk. */
+        /*.aio_fsync    = ,*/                     /* Sync all outstanding async
+                                                     i/o operations on a
+                                                     kiocb. */
+#endif /* NTFS_RW */
+        /*.ioctl        = ,*/                     /* Perform function on the
+                                                     mounted filesystem. */
+        .mmap           = generic_file_mmap,      /* Mmap file. */
+        .open           = ntfs_file_open,         /* Open file. */
+        .sendfile       = generic_file_sendfile,  /* Zero-copy data send with
+                                                     the data source being on
+                                                     the ntfs partition.  We
+                                                     do not need to care about
+                                                     the data destination. */
+        /*.sendpage     = ,*/                     /* Zero-copy data send with
+                                                     the data destination being
+                                                     on the ntfs partition.  We
+                                                     do not need to care about
+                                                     the data source. */
+};
+struct inode_operations ntfs_file_inode_ops = { /* Empty without NTFS_RW. */
+#ifdef NTFS_RW
+        .truncate       = ntfs_truncate_vfs,    /* Truncate hook. */
+        .setattr        = ntfs_setattr,         /* Attribute change hook. */
+#endif /* NTFS_RW */
+};
+struct file_operations ntfs_empty_file_ops = {};        /* No methods set. */
+struct inode_operations ntfs_empty_inode_ops = {};      /* No methods set. */
diff --git a/fs/ntfs/index.c b/fs/ntfs/index.c
new file mode 100644
index 000000000000..71bd2cd7a4d9
--- /dev/null
+++ b/fs/ntfs/index.c
@@ -0,0 +1,461 @@
+/*
+ * index.c - NTFS kernel index handling.  Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2004 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#include "aops.h"
+#include "collate.h"
+#include "debug.h"
+#include "index.h"
+#include "ntfs.h"
+/**
+ * ntfs_index_ctx_get - allocate and initialize a new index context
+ * @idx_ni:     ntfs index inode with which to initialize the context
+ *
+ * Take a context from ntfs_index_ctx_cache, point it at @idx_ni and reset its
+ * lookup state to NULL/0.  Return the context or NULL if allocation failed.
+ *
+ * Locking:  Caller must hold i_sem on the index inode.
+ */
+ntfs_index_context *ntfs_index_ctx_get(ntfs_inode *idx_ni)
+{
+        ntfs_index_context *ctx =
+                        kmem_cache_alloc(ntfs_index_ctx_cache, SLAB_NOFS);
+        if (unlikely(!ctx))
+                return NULL;
+        ctx->idx_ni = idx_ni;
+        ctx->entry = NULL;
+        ctx->data = NULL;
+        ctx->data_len = 0;
+        ctx->is_in_root = 0;
+        ctx->ir = NULL;
+        ctx->actx = NULL;
+        ctx->base_ni = NULL;
+        ctx->ia = NULL;
+        ctx->page = NULL;
+        return ctx;
+}
+/**
+ * ntfs_index_ctx_put - release an index context
+ * @ictx:       index context to free
+ *
+ * Drop whatever is still attached to @ictx and free the context itself.
+ *
+ * Locking:  Caller must hold i_sem on the index inode.
+ */
+void ntfs_index_ctx_put(ntfs_index_context *ictx)
+{
+        struct page *page = ictx->page;
+        /*
+         * Nothing attached is released unless @ictx->entry is set.
+         */
+        if (ictx->entry && ictx->is_in_root) {
+                /* Index root entry: put search context, unmap mft record. */
+                if (ictx->actx)
+                        ntfs_attr_put_search_ctx(ictx->actx);
+                if (ictx->base_ni)
+                        unmap_mft_record(ictx->base_ni);
+        } else if (ictx->entry && page) {
+                /* Index allocation entry: the page must still be locked. */
+                BUG_ON(!PageLocked(page));
+                unlock_page(page);
+                ntfs_unmap_page(page);
+        }
+        kmem_cache_free(ntfs_index_ctx_cache, ictx);
+}
+/**
+ * ntfs_index_lookup - find a key in an index and return its index entry
+ * @key:        [IN] key for which to search in the index
+ * @key_len:    [IN] length of @key in bytes
+ * @ictx:       [IN/OUT] context describing the index and the returned entry
+ *
+ * Before calling ntfs_index_lookup(), @ictx must have been obtained from a
+ * call to ntfs_index_ctx_get().
+ *
+ * Look for the @key in the index specified by the index lookup context @ictx.
+ * ntfs_index_lookup() walks the contents of the index looking for the @key.
+ *
+ * If the @key is found in the index, 0 is returned and @ictx is setup to
+ * describe the index entry containing the matching @key.  @ictx->entry is the
+ * index entry and @ictx->data and @ictx->data_len are the index entry data and
+ * its length in bytes, respectively.
+ *
+ * If the @key is not found in the index, -ENOENT is returned and @ictx is
+ * setup to describe the index entry whose key collates immediately after the
+ * search @key, i.e. this is the position in the index at which an index entry
+ * with a key of @key would need to be inserted.
+ *
+ * If an error occurs return the negative error code and @ictx is left
+ * untouched.
+ *
+ * When finished with the entry and its data, call ntfs_index_ctx_put() to free
+ * the context and other associated resources.
+ *
+ * If the index entry was modified, call flush_dcache_index_entry_page()
+ * immediately after the modification and either ntfs_index_entry_mark_dirty()
+ * or ntfs_index_entry_write() before the call to ntfs_index_ctx_put() to
+ * ensure that the changes are written to disk.
+ *
+ * Locking:  - Caller must hold i_sem on the index inode.
+ *           - Each page cache page in the index allocation mapping must be
+ *             locked whilst being accessed otherwise we may find a corrupt
+ *             page due to it being under ->writepage at the moment which
+ *             applies the mst protection fixups before writing out and then
+ *             removes them again after the write is complete after which it
+ *             unlocks the page.
+ */
+int ntfs_index_lookup(const void *key, const int key_len,
+                ntfs_index_context *ictx)
+{
+        VCN vcn, old_vcn;
+        ntfs_inode *idx_ni = ictx->idx_ni;
+        ntfs_volume *vol = idx_ni->vol;
+        struct super_block *sb = vol->sb;
+        ntfs_inode *base_ni = idx_ni->ext.base_ntfs_ino;
+        MFT_RECORD *m;
+        INDEX_ROOT *ir;
+        INDEX_ENTRY *ie;
+        INDEX_ALLOCATION *ia;
+        u8 *index_end, *kaddr;
+        ntfs_attr_search_ctx *actx;
+        struct address_space *ia_mapping;
+        struct page *page;
+        int rc, err = 0;
+        ntfs_debug("Entering.");
+        /* The index inode must be an attribute inode for $INDEX_ALLOCATION. */
+        BUG_ON(!NInoAttr(idx_ni));
+        BUG_ON(idx_ni->type != AT_INDEX_ALLOCATION);
+        BUG_ON(idx_ni->nr_extents != -1);
+        BUG_ON(!base_ni);
+        BUG_ON(!key);
+        BUG_ON(key_len <= 0);
+        if (!ntfs_is_collation_rule_supported(
+                        idx_ni->itype.index.collation_rule)) {
+                ntfs_error(sb, "Index uses unsupported collation rule 0x%x.  "
+                                "Aborting lookup.", le32_to_cpu(
+                                idx_ni->itype.index.collation_rule));
+                return -EOPNOTSUPP;
+        }
+        /* Get hold of the mft record for the index inode. */
+        m = map_mft_record(base_ni);
+        if (IS_ERR(m)) {
+                ntfs_error(sb, "map_mft_record() failed with error code %ld.",
+                                -PTR_ERR(m));
+                return PTR_ERR(m);
+        }
+        actx = ntfs_attr_get_search_ctx(base_ni, m);
+        if (unlikely(!actx)) {
+                err = -ENOMEM;
+                goto err_out;
+        }
+        /* Find the index root attribute in the mft record. */
+        err = ntfs_attr_lookup(AT_INDEX_ROOT, idx_ni->name, idx_ni->name_len,
+                        CASE_SENSITIVE, 0, NULL, 0, actx);
+        if (unlikely(err)) {
+                if (err == -ENOENT) {
+                        ntfs_error(sb, "Index root attribute missing in inode "
+                                        "0x%lx.", idx_ni->mft_no);
+                        err = -EIO;
+                }
+                goto err_out;
+        }
+        /* Get to the index root value (it has been verified in read_inode). */
+        ir = (INDEX_ROOT*)((u8*)actx->attr +
+                        le16_to_cpu(actx->attr->data.resident.value_offset));
+        index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length);
+        /* The first index entry. */
+        ie = (INDEX_ENTRY*)((u8*)&ir->index +
+                        le32_to_cpu(ir->index.entries_offset));
+        /*
+         * Loop until we exceed valid memory (corruption case) or until we
+         * reach the last entry.
+         */
+        for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
+                /* Bounds checks. */
+                if ((u8*)ie < (u8*)actx->mrec || (u8*)ie +
+                                sizeof(INDEX_ENTRY_HEADER) > index_end ||
+                                (u8*)ie + le16_to_cpu(ie->length) > index_end)
+                        goto idx_err_out;
+                /*
+                 * The last entry cannot contain a key.  It can however contain
+                 * a pointer to a child node in the B+tree so we just break out.
+                 */
+                if (ie->flags & INDEX_ENTRY_END)
+                        break;
+                /* Further bounds checks. */
+                if ((u32)sizeof(INDEX_ENTRY_HEADER) +
+                                le16_to_cpu(ie->key_length) >
+                                le16_to_cpu(ie->data.vi.data_offset) ||
+                                (u32)le16_to_cpu(ie->data.vi.data_offset) +
+                                le16_to_cpu(ie->data.vi.data_length) >
+                                le16_to_cpu(ie->length))
+                        goto idx_err_out;
+                /* If the keys match perfectly, we setup @ictx and return 0. */
+                if ((key_len == le16_to_cpu(ie->key_length)) && !memcmp(key,
+                                &ie->key, key_len)) {
+ir_done:
+                        ictx->is_in_root = TRUE;
+                        /*
+                         * Record the index root so that @ictx->ir matches its
+                         * documented meaning.  It points into the mft record
+                         * mapped via @actx, which is handed over to @ictx.
+                         */
+                        ictx->ir = ir;
+                        ictx->actx = actx;
+                        ictx->base_ni = base_ni;
+                        ictx->ia = NULL;
+                        ictx->page = NULL;
+done:
+                        /* Shared exit for root and index block results. */
+                        ictx->entry = ie;
+                        ictx->data = (u8*)ie +
+                                        le16_to_cpu(ie->data.vi.data_offset);
+                        ictx->data_len = le16_to_cpu(ie->data.vi.data_length);
+                        ntfs_debug("Done.");
+                        return err;
+                }
+                /*
+                 * Not a perfect match, need to do full blown collation so we
+                 * know which way in the B+tree we have to go.
+                 */
+                rc = ntfs_collate(vol, idx_ni->itype.index.collation_rule, key,
+                                key_len, &ie->key, le16_to_cpu(ie->key_length));
+                /*
+                 * If @key collates before the key of the current entry, there
+                 * is definitely no such key in this index but we might need to
+                 * descend into the B+tree so we just break out of the loop.
+                 */
+                if (rc == -1)
+                        break;
+                /*
+                 * A match should never happen as the memcmp() call should have
+                 * caught it, but we still treat it correctly.
+                 */
+                if (!rc)
+                        goto ir_done;
+                /* The keys are not equal, continue the search. */
+        }
+        /*
+         * We have finished with this index without success.  Check for the
+         * presence of a child node and if not present setup @ictx and return
+         * -ENOENT.
+         */
+        if (!(ie->flags & INDEX_ENTRY_NODE)) {
+                ntfs_debug("Entry not found.");
+                err = -ENOENT;
+                goto ir_done;
+        } /* Child node present, descend into it. */
+        /* Consistency check: Verify that an index allocation exists. */
+        if (!NInoIndexAllocPresent(idx_ni)) {
+                ntfs_error(sb, "No index allocation attribute but index entry "
+                                "requires one.  Inode 0x%lx is corrupt or "
+                                "driver bug.", idx_ni->mft_no);
+                goto err_out;
+        }
+        /* Get the starting vcn of the index_block holding the child node. */
+        vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8));
+        ia_mapping = VFS_I(idx_ni)->i_mapping;
+        /*
+         * We are done with the index root and the mft record.  Release them,
+         * otherwise we deadlock with ntfs_map_page().
+         */
+        ntfs_attr_put_search_ctx(actx);
+        unmap_mft_record(base_ni);
+        /* Clear both so that the error path does not release them again. */
+        m = NULL;
+        actx = NULL;
+descend_into_child_node:
+        /*
+         * Convert vcn to index into the index allocation attribute in units
+         * of PAGE_CACHE_SIZE and map the page cache page, reading it from
+         * disk if necessary.
+         */
+        page = ntfs_map_page(ia_mapping, vcn <<
+                        idx_ni->itype.index.vcn_size_bits >> PAGE_CACHE_SHIFT);
+        if (IS_ERR(page)) {
+                ntfs_error(sb, "Failed to map index page, error %ld.",
+                                -PTR_ERR(page));
+                err = PTR_ERR(page);
+                goto err_out;
+        }
+        lock_page(page);
+        kaddr = (u8*)page_address(page);
+fast_descend_into_child_node:
+        /* Get to the index allocation block. */
+        ia = (INDEX_ALLOCATION*)(kaddr + ((vcn <<
+                        idx_ni->itype.index.vcn_size_bits) & ~PAGE_CACHE_MASK));
+        /* Bounds checks. */
+        if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_CACHE_SIZE) {
+                ntfs_error(sb, "Out of bounds check failed.  Corrupt inode "
+                                "0x%lx or driver bug.", idx_ni->mft_no);
+                goto unm_err_out;
+        }
+        /* Catch multi sector transfer fixup errors. */
+        if (unlikely(!ntfs_is_indx_record(ia->magic))) {
+                ntfs_error(sb, "Index record with vcn 0x%llx is corrupt.  "
+                                "Corrupt inode 0x%lx.  Run chkdsk.",
+                                (long long)vcn, idx_ni->mft_no);
+                goto unm_err_out;
+        }
+        if (sle64_to_cpu(ia->index_block_vcn) != vcn) {
+                ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is "
+                                "different from expected VCN (0x%llx).  Inode "
+                                "0x%lx is corrupt or driver bug.",
+                                (unsigned long long)
+                                sle64_to_cpu(ia->index_block_vcn),
+                                (unsigned long long)vcn, idx_ni->mft_no);
+                goto unm_err_out;
+        }
+        if (le32_to_cpu(ia->index.allocated_size) + 0x18 !=
+                        idx_ni->itype.index.block_size) {
+                ntfs_error(sb, "Index buffer (VCN 0x%llx) of inode 0x%lx has "
+                                "a size (%u) differing from the index "
+                                "specified size (%u).  Inode is corrupt or "
+                                "driver bug.", (unsigned long long)vcn,
+                                idx_ni->mft_no,
+                                le32_to_cpu(ia->index.allocated_size) + 0x18,
+                                idx_ni->itype.index.block_size);
+                goto unm_err_out;
+        }
+        index_end = (u8*)ia + idx_ni->itype.index.block_size;
+        if (index_end > kaddr + PAGE_CACHE_SIZE) {
+                ntfs_error(sb, "Index buffer (VCN 0x%llx) of inode 0x%lx "
+                                "crosses page boundary.  Impossible!  Cannot "
+                                "access!  This is probably a bug in the "
+                                "driver.", (unsigned long long)vcn,
+                                idx_ni->mft_no);
+                goto unm_err_out;
+        }
+        index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length);
+        if (index_end > (u8*)ia + idx_ni->itype.index.block_size) {
+                ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of inode "
+                                "0x%lx exceeds maximum size.",
+                                (unsigned long long)vcn, idx_ni->mft_no);
+                goto unm_err_out;
+        }
+        /* The first index entry. */
+        ie = (INDEX_ENTRY*)((u8*)&ia->index +
+                        le32_to_cpu(ia->index.entries_offset));
+        /*
+         * Iterate similar to above big loop but applied to index buffer, thus
+         * loop until we exceed valid memory (corruption case) or until we
+         * reach the last entry.
+         */
+        for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
+                /* Bounds checks. */
+                if ((u8*)ie < (u8*)ia || (u8*)ie +
+                                sizeof(INDEX_ENTRY_HEADER) > index_end ||
+                                (u8*)ie + le16_to_cpu(ie->length) > index_end) {
+                        ntfs_error(sb, "Index entry out of bounds in inode "
+                                        "0x%lx.", idx_ni->mft_no);
+                        goto unm_err_out;
+                }
+                /*
+                 * The last entry cannot contain a key.  It can however contain
+                 * a pointer to a child node in the B+tree so we just break out.
+                 */
+                if (ie->flags & INDEX_ENTRY_END)
+                        break;
+                /* Further bounds checks. */
+                if ((u32)sizeof(INDEX_ENTRY_HEADER) +
+                                le16_to_cpu(ie->key_length) >
+                                le16_to_cpu(ie->data.vi.data_offset) ||
+                                (u32)le16_to_cpu(ie->data.vi.data_offset) +
+                                le16_to_cpu(ie->data.vi.data_length) >
+                                le16_to_cpu(ie->length)) {
+                        ntfs_error(sb, "Index entry out of bounds in inode "
+                                        "0x%lx.", idx_ni->mft_no);
+                        goto unm_err_out;
+                }
+                /* If the keys match perfectly, we setup @ictx and return 0. */
+                if ((key_len == le16_to_cpu(ie->key_length)) && !memcmp(key,
+                                &ie->key, key_len)) {
+ia_done:
+                        ictx->is_in_root = FALSE;
+                        /* No index root is referenced by this result. */
+                        ictx->ir = NULL;
+                        ictx->actx = NULL;
+                        ictx->base_ni = NULL;
+                        ictx->ia = ia;
+                        ictx->page = page;
+                        goto done;
+                }
+                /*
+                 * Not a perfect match, need to do full blown collation so we
+                 * know which way in the B+tree we have to go.
+                 */
+                rc = ntfs_collate(vol, idx_ni->itype.index.collation_rule, key,
+                                key_len, &ie->key, le16_to_cpu(ie->key_length));
+                /*
+                 * If @key collates before the key of the current entry, there
+                 * is definitely no such key in this index but we might need to
+                 * descend into the B+tree so we just break out of the loop.
+                 */
+                if (rc == -1)
+                        break;
+                /*
+                 * A match should never happen as the memcmp() call should have
+                 * caught it, but we still treat it correctly.
+                 */
+                if (!rc)
+                        goto ia_done;
+                /* The keys are not equal, continue the search. */
+        }
+        /*
+         * We have finished with this index buffer without success.  Check for
+         * the presence of a child node and if not present return -ENOENT.
+         */
+        if (!(ie->flags & INDEX_ENTRY_NODE)) {
+                ntfs_debug("Entry not found.");
+                err = -ENOENT;
+                goto ia_done;
+        }
+        if ((ia->index.flags & NODE_MASK) == LEAF_NODE) {
+                ntfs_error(sb, "Index entry with child node found in a leaf "
+                                "node in inode 0x%lx.", idx_ni->mft_no);
+                goto unm_err_out;
+        }
+        /* Child node present, descend into it. */
+        old_vcn = vcn;
+        vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8));
+        if (vcn >= 0) {
+                /*
+                 * If vcn is in the same page cache page as old_vcn we recycle
+                 * the mapped page.
+                 */
+                if (old_vcn << vol->cluster_size_bits >>
+                                PAGE_CACHE_SHIFT == vcn <<
+                                vol->cluster_size_bits >>
+                                PAGE_CACHE_SHIFT)
+                        goto fast_descend_into_child_node;
+                unlock_page(page);
+                ntfs_unmap_page(page);
+                goto descend_into_child_node;
+        }
+        ntfs_error(sb, "Negative child node vcn in inode 0x%lx.",
+                        idx_ni->mft_no);
+unm_err_out:
+        unlock_page(page);
+        ntfs_unmap_page(page);
+err_out:
+        /* Corruption paths jump here without setting @err; report -EIO. */
+        if (!err)
+                err = -EIO;
+        if (actx)
+                ntfs_attr_put_search_ctx(actx);
+        if (m)
+                unmap_mft_record(base_ni);
+        return err;
+idx_err_out:
+        ntfs_error(sb, "Corrupt index.  Aborting lookup.");
+        goto err_out;
+}
diff --git a/fs/ntfs/index.h b/fs/ntfs/index.h
new file mode 100644
index 000000000000..846a489e8692
--- /dev/null
+++ b/fs/ntfs/index.h
@@ -0,0 +1,148 @@
+/*
+ * index.h - Defines for NTFS kernel index handling.  Part of the Linux-NTFS
+ *           project.
+ *
+ * Copyright (c) 2004 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#ifndef _LINUX_NTFS_INDEX_H
+#define _LINUX_NTFS_INDEX_H
+#include <linux/fs.h>
+#include "types.h"
+#include "layout.h"
+#include "inode.h"
+#include "attrib.h"
+#include "mft.h"
+#include "aops.h"
+/**
+ * ntfs_index_context - context describing an index entry found by a lookup
+ * @idx_ni:     index inode containing the @entry described by this context
+ * @entry:      index entry (points into @ir or @ia)
+ * @data:       index entry data (points into @entry)
+ * @data_len:   length in bytes of @data
+ * @is_in_root: TRUE if @entry is in @ir and FALSE if it is in @ia
+ * @ir:         index root if @is_in_root and NULL otherwise
+ * @actx:       attribute search context if @is_in_root and NULL otherwise
+ * @base_ni:    base inode if @is_in_root and NULL otherwise
+ * @ia:         index block if @is_in_root is FALSE and NULL otherwise
+ * @page:       page if @is_in_root is FALSE and NULL otherwise
+ *
+ * @idx_ni is the index inode this context belongs to.
+ *
+ * @entry is the index entry described by this context.  @data and @data_len
+ * are the index entry data and its length in bytes, respectively.  @data
+ * simply points into @entry.  This is probably what the user is interested in.
+ *
+ * If @is_in_root is TRUE, @entry is in the index root attribute @ir described
+ * by the attribute search context @actx and the base inode @base_ni.  @ia and
+ * @page are NULL in this case.
+ *
+ * If @is_in_root is FALSE, @entry is in the index allocation attribute and @ia
+ * and @page point to the index allocation block and the mapped, locked page it
+ * is in, respectively.  @ir, @actx and @base_ni are NULL in this case.
+ *
+ * To obtain a context call ntfs_index_ctx_get().
+ *
+ * We use this context to allow ntfs_index_lookup() to return the found index
+ * @entry and its @data without having to allocate a buffer and copy the @entry
+ * and/or its @data into it.
+ *
+ * When finished with the @entry and its @data, call ntfs_index_ctx_put() to
+ * free the context and other associated resources.
+ *
+ * If the index entry was modified, call ntfs_index_entry_flush_dcache_page()
+ * immediately after the modification and either ntfs_index_entry_mark_dirty()
+ * or ntfs_index_entry_write() before the call to ntfs_index_ctx_put() to
+ * ensure that the changes are written to disk.
+ */
+typedef struct {
+        ntfs_inode *idx_ni;
+        INDEX_ENTRY *entry;
+        void *data;
+        u16 data_len;
+        BOOL is_in_root;
+        /* Meaningful only when @is_in_root is TRUE. */
+        INDEX_ROOT *ir;
+        ntfs_attr_search_ctx *actx;
+        ntfs_inode *base_ni;
+        /* Meaningful only when @is_in_root is FALSE. */
+        INDEX_ALLOCATION *ia;
+        struct page *page;
+} ntfs_index_context;
+/*
+ * Index context life cycle: obtain a context with ntfs_index_ctx_get(), let
+ * ntfs_index_lookup() fill it in for a given search key, and release it
+ * together with its associated resources with ntfs_index_ctx_put().
+ */
+extern ntfs_index_context *ntfs_index_ctx_get(ntfs_inode *idx_ni);
+extern void ntfs_index_ctx_put(ntfs_index_context *ictx);
+extern int ntfs_index_lookup(const void *key, const int key_len,
+                ntfs_index_context *ictx);
+#ifdef NTFS_RW
+/**
+ * ntfs_index_entry_flush_dcache_page - flush_dcache_page() for index entries
+ * @ictx:       ntfs index context describing the index entry
+ *
+ * Flush the data cache for the page in which the index entry described by
+ * @ictx resides.  Call this directly after every modification of the index
+ * entry.
+ *
+ * For an entry located in the index root attribute, the page flushed is the
+ * one containing the mft record that holds the index root attribute.
+ *
+ * For an entry located in an index block of the index allocation attribute,
+ * the page flushed is the page cache page containing that index block.
+ */
+static inline void ntfs_index_entry_flush_dcache_page(ntfs_index_context *ictx)
+{
+        /* Index block case: the entry lives in a page cache page. */
+        if (!ictx->is_in_root) {
+                flush_dcache_page(ictx->page);
+                return;
+        }
+        /* Index root case: the entry lives inside the mft record. */
+        flush_dcache_mft_record_page(ictx->actx->ntfs_ino);
+}
+/**
+ * ntfs_index_entry_mark_dirty - mark an index entry dirty
+ * @ictx:       ntfs index context describing the index entry
+ *
+ * Mark the index entry described by the index entry context @ictx dirty so
+ * that it is written out to disk later.
+ *
+ * For an entry in the index root attribute, the mft record containing the
+ * index root attribute is marked dirty, which ensures the mft record, and
+ * hence the index root attribute, will be written out.
+ *
+ * For an entry in an index block belonging to the index allocation attribute,
+ * the buffers belonging to the index record as well as the page cache page
+ * the index block is in are marked dirty.  This automatically marks the VFS
+ * inode of the ntfs index inode to which the index entry belongs dirty, too
+ * (I_DIRTY_PAGES) and this in turn ensures the page buffers, and hence the
+ * dirty index block, will be written out.
+ */
+static inline void ntfs_index_entry_mark_dirty(ntfs_index_context *ictx)
+{
+        u8 *page_start;
+        if (ictx->is_in_root) {
+                mark_mft_record_dirty(ictx->actx->ntfs_ino);
+                return;
+        }
+        /* Pass the byte offset of the index block within its page. */
+        page_start = (u8*)page_address(ictx->page);
+        mark_ntfs_record_dirty(ictx->page, (u8*)ictx->ia - page_start);
+}
+#endif /* NTFS_RW */
+#endif /* _LINUX_NTFS_INDEX_H */
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
new file mode 100644
index 000000000000..31840ba0b38c
--- /dev/null
+++ b/fs/ntfs/inode.c
@@ -0,0 +1,2616 @@
+/**
+ * inode.c - NTFS kernel inode handling. Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2004 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#include <linux/pagemap.h>
+#include <linux/buffer_head.h>
+#include <linux/smp_lock.h>
+#include <linux/quotaops.h>
+#include <linux/mount.h>
+#include "aops.h"
+#include "dir.h"
+#include "debug.h"
+#include "inode.h"
+#include "attrib.h"
+#include "malloc.h"
+#include "mft.h"
+#include "time.h"
+#include "ntfs.h"
+/**
+ * ntfs_test_inode - compare two (possibly fake) inodes for equality
+ * @vi:         vfs inode which to test
+ * @na:         ntfs attribute which is being tested with
+ *
+ * Check whether the ntfs attribute embedded in the ntfs specific part of the
+ * vfs inode @vi is the one described by @na.
+ *
+ * To search for the normal file/directory inode, set @na->type to AT_UNUSED;
+ * @na->name and @na->name_len are then ignored.
+ *
+ * Return 1 if the attributes match and 0 if not.
+ *
+ * NOTE: This function runs with the inode_lock spin lock held so it is not
+ * allowed to sleep.
+ */
+int ntfs_test_inode(struct inode *vi, ntfs_attr *na)
+{
+        ntfs_inode *ni;
+        /* A different mft record number can never match. */
+        if (vi->i_ino != na->mft_no)
+                return 0;
+        ni = NTFS_I(vi);
+        /*
+         * If !NInoAttr(ni), @vi is a normal file or directory inode and it
+         * matches only when a normal inode is being looked for.
+         */
+        if (likely(!NInoAttr(ni)))
+                return likely(na->type == AT_UNUSED) ? 1 : 0;
+        /* A fake inode describing an attribute: type and name must agree. */
+        if (ni->type != na->type || ni->name_len != na->name_len)
+                return 0;
+        /* Unnamed attributes have no name to compare. */
+        if (!na->name_len)
+                return 1;
+        return memcmp(ni->name, na->name,
+                        na->name_len * sizeof(ntfschar)) ? 0 : 1;
+}
+/**
+ * ntfs_init_locked_inode - initialize an inode
+ * @vi:         vfs inode to initialize
+ * @na:         ntfs attribute which to initialize @vi to
+ *
+ * Initialize the vfs inode @vi with the values from the ntfs attribute @na in
+ * order to enable ntfs_test_inode() to do its work.
+ *
+ * If initializing the normal file/directory inode, set @na->type to AT_UNUSED.
+ * In that case, @na->name and @na->name_len must be NULL and 0, respectively
+ * (this is enforced with BUG_ON()).
+ *
+ * For a named attribute whose name is not the global I30 constant, a private
+ * NUL terminated copy of the name is allocated and stored in the ntfs inode.
+ *
+ * Return 0 on success and -errno on error (-ENOMEM if the name copy could not
+ * be allocated).
+ *
+ * NOTE: This function runs with the inode_lock spin lock held so it is not
+ * allowed to sleep. (Hence the GFP_ATOMIC allocation.)
+ */
+static int ntfs_init_locked_inode(struct inode *vi, ntfs_attr *na)
+{
+        ntfs_inode *ni = NTFS_I(vi);
+        vi->i_ino = na->mft_no;
+        ni->type = na->type;
+        if (na->type == AT_INDEX_ALLOCATION)
+                NInoSetMstProtected(ni);
+        ni->name = na->name;
+        ni->name_len = na->name_len;
+        /* If initializing a normal inode, we are done. */
+        if (likely(na->type == AT_UNUSED)) {
+                BUG_ON(na->name);
+                BUG_ON(na->name_len);
+                return 0;
+        }
+        /* It is a fake inode. */
+        NInoSetAttr(ni);
+        /*
+         * We have I30 global constant as an optimization as it is the name
+         * in >99.9% of named attributes! The other <0.1% incur a GFP_ATOMIC
+         * allocation but that is ok. And most attributes are unnamed anyway,
+         * thus the fraction of named attributes with name != I30 is actually
+         * absolutely tiny.
+         */
+        if (na->name_len && na->name != I30) {
+                unsigned int i;
+                BUG_ON(!na->name);
+                /* Size of the name in bytes, excluding the terminator. */
+                i = na->name_len * sizeof(ntfschar);
+                ni->name = (ntfschar*)kmalloc(i + sizeof(ntfschar), GFP_ATOMIC);
+                if (!ni->name)
+                        return -ENOMEM;
+                memcpy(ni->name, na->name, i);
+                /*
+                 * Terminate the copy.  @ni->name is an array of ntfschar so
+                 * it must be indexed by the character count; indexing by the
+                 * byte count @i would write beyond the allocated buffer.
+                 */
+                ni->name[na->name_len] = 0;
+        }
+        return 0;
+}
+/*
+ * Callback type used to cast ntfs_init_locked_inode() when it is handed to
+ * iget5_locked() as the inode initialization callback.
+ */
+typedef int (*set_t)(struct inode *, void *);
+/* Readers invoked on freshly allocated (I_NEW) inodes by the iget helpers. */
+static int ntfs_read_locked_inode(struct inode *vi);
+static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi);
+static int ntfs_read_locked_index_inode(struct inode *base_vi,
+                struct inode *vi);
+/**
+ * ntfs_iget - obtain a struct inode corresponding to a specific normal inode
+ * @sb:         super block of mounted volume
+ * @mft_no:     mft record number / inode number to obtain
+ *
+ * Obtain the struct inode corresponding to a specific normal inode (i.e. a
+ * file or directory).
+ *
+ * An inode found in the cache is returned as is with an increased reference
+ * count.  Otherwise a new struct inode is allocated and initialized, and
+ * ntfs_read_locked_inode() is called to read in the inode and fill in the
+ * remainder of the inode structure.
+ *
+ * Return the struct inode on success. Check the return value with IS_ERR() and
+ * if true, the function failed and the error code is obtained from PTR_ERR().
+ */
+struct inode *ntfs_iget(struct super_block *sb, unsigned long mft_no)
+{
+        struct inode *vi;
+        ntfs_attr na;
+        int err;
+        /* Describe the normal (i.e. non-attribute) inode we are after. */
+        na.name_len = 0;
+        na.name = NULL;
+        na.type = AT_UNUSED;
+        na.mft_no = mft_no;
+        vi = iget5_locked(sb, mft_no, (test_t)ntfs_test_inode,
+                        (set_t)ntfs_init_locked_inode, &na);
+        if (!vi)
+                return ERR_PTR(-ENOMEM);
+        /* An inode that was already in the cache needs no further work. */
+        if (!(vi->i_state & I_NEW))
+                return vi;
+        /* This is a freshly allocated inode, need to read it now. */
+        err = ntfs_read_locked_inode(vi);
+        unlock_new_inode(vi);
+        /*
+         * There is no point in keeping bad inodes around if the failure was
+         * due to ENOMEM. We want to be able to retry again later.  For any
+         * other outcome the inode is handed to the caller.
+         */
+        if (err != -ENOMEM)
+                return vi;
+        iput(vi);
+        return ERR_PTR(err);
+}
+/**
+ * ntfs_attr_iget - obtain a struct inode corresponding to an attribute
+ * @base_vi:    vfs base inode containing the attribute
+ * @type:       attribute type
+ * @name:       Unicode name of the attribute (NULL if unnamed)
+ * @name_len:   length of @name in Unicode characters (0 if unnamed)
+ *
+ * Obtain the (fake) struct inode corresponding to the attribute specified by
+ * @type, @name, and @name_len, which is present in the base mft record
+ * specified by the vfs inode @base_vi.
+ *
+ * An attribute inode found in the cache is returned as is with an increased
+ * reference count.  Otherwise a new struct inode is allocated and initialized,
+ * and ntfs_read_locked_attr_inode() is called to read the attribute and fill
+ * in the inode structure.
+ *
+ * Note, for index allocation attributes, you need to use ntfs_index_iget()
+ * instead of ntfs_attr_iget() as working with indices is a lot more complex.
+ *
+ * Return the struct inode of the attribute inode on success. Check the return
+ * value with IS_ERR() and if true, the function failed and the error code is
+ * obtained from PTR_ERR().
+ */
+struct inode *ntfs_attr_iget(struct inode *base_vi, ATTR_TYPE type,
+                ntfschar *name, u32 name_len)
+{
+        struct inode *vi;
+        ntfs_attr na;
+        int err;
+        /* Make sure no one calls ntfs_attr_iget() for indices. */
+        BUG_ON(type == AT_INDEX_ALLOCATION);
+        /* Describe the attribute inode we are after. */
+        na.name_len = name_len;
+        na.name = name;
+        na.type = type;
+        na.mft_no = base_vi->i_ino;
+        vi = iget5_locked(base_vi->i_sb, na.mft_no, (test_t)ntfs_test_inode,
+                        (set_t)ntfs_init_locked_inode, &na);
+        if (!vi)
+                return ERR_PTR(-ENOMEM);
+        /* An inode that was already in the cache needs no further work. */
+        if (!(vi->i_state & I_NEW))
+                return vi;
+        /* This is a freshly allocated inode, need to read it now. */
+        err = ntfs_read_locked_attr_inode(base_vi, vi);
+        unlock_new_inode(vi);
+        if (!err)
+                return vi;
+        /*
+         * There is no point in keeping bad attribute inodes around. This also
+         * simplifies things in that we never need to check for bad attribute
+         * inodes elsewhere.
+         */
+        iput(vi);
+        return ERR_PTR(err);
+}
+/**
+ * ntfs_index_iget - obtain a struct inode corresponding to an index
+ * @base_vi:    vfs base inode containing the index related attributes
+ * @name:       Unicode name of the index
+ * @name_len:   length of @name in Unicode characters
+ *
+ * Obtain the (fake) struct inode corresponding to the index specified by @name
+ * and @name_len, which is present in the base mft record specified by the vfs
+ * inode @base_vi.
+ *
+ * An index inode found in the cache is returned as is with an increased
+ * reference count.  Otherwise a new struct inode is allocated and initialized,
+ * and ntfs_read_locked_index_inode() is called to read the index related
+ * attributes and fill in the inode structure.
+ *
+ * Return the struct inode of the index inode on success. Check the return
+ * value with IS_ERR() and if true, the function failed and the error code is
+ * obtained from PTR_ERR().
+ */
+struct inode *ntfs_index_iget(struct inode *base_vi, ntfschar *name,
+                u32 name_len)
+{
+        struct inode *vi;
+        ntfs_attr na;
+        int err;
+        /* An index inode is keyed on its index allocation attribute. */
+        na.name_len = name_len;
+        na.name = name;
+        na.type = AT_INDEX_ALLOCATION;
+        na.mft_no = base_vi->i_ino;
+        vi = iget5_locked(base_vi->i_sb, na.mft_no, (test_t)ntfs_test_inode,
+                        (set_t)ntfs_init_locked_inode, &na);
+        if (!vi)
+                return ERR_PTR(-ENOMEM);
+        /* An inode that was already in the cache needs no further work. */
+        if (!(vi->i_state & I_NEW))
+                return vi;
+        /* This is a freshly allocated inode, need to read it now. */
+        err = ntfs_read_locked_index_inode(base_vi, vi);
+        unlock_new_inode(vi);
+        if (!err)
+                return vi;
+        /*
+         * There is no point in keeping bad index inodes around.  This also
+         * simplifies things in that we never need to check for bad index
+         * inodes elsewhere.
+         */
+        iput(vi);
+        return ERR_PTR(err);
+}
+/**
+ * ntfs_alloc_big_inode - allocate an ntfs inode from the big inode cache
+ * @sb:         super block, used here only when reporting a failed allocation
+ *
+ * Take an object from ntfs_big_inode_cache using SLAB_NOFS and reset its
+ * state flags to zero.  No other field of the new ntfs inode is set up here.
+ *
+ * Return the result of VFS_I() on the new ntfs inode on success, or NULL
+ * (after logging an error) if the allocation failed.
+ */
+struct inode *ntfs_alloc_big_inode(struct super_block *sb)
+{
+        ntfs_inode *ni;
+        ntfs_debug("Entering.");
+        ni = kmem_cache_alloc(ntfs_big_inode_cache, SLAB_NOFS);
+        if (unlikely(ni == NULL)) {
+                ntfs_error(sb,
+                                "Allocation of NTFS big inode structure failed.");
+                return NULL;
+        }
+        ni->state = 0;
+        return VFS_I(ni);
+}
+/**
+ * ntfs_destroy_big_inode - give an ntfs inode back to the big inode cache
+ * @inode:      vfs inode whose ntfs inode, as obtained by NTFS_I(), is freed
+ *
+ * It is a bug to get here while the ntfs inode still has ->page set or while
+ * dropping one reference does not bring ->count down to zero.
+ */
+void ntfs_destroy_big_inode(struct inode *inode)
+{
+        ntfs_inode *ni = NTFS_I(inode);
+        ntfs_debug("Entering.");
+        BUG_ON(ni->page != NULL);
+        /* We must be dropping the very last reference. */
+        if (!atomic_dec_and_test(&ni->count))
+                BUG();
+        kmem_cache_free(ntfs_big_inode_cache, ni);
+}
+/**
+ * ntfs_alloc_extent_inode - allocate an ntfs inode from the ntfs inode cache
+ *
+ * Take an object from ntfs_inode_cache using SLAB_NOFS and reset its state
+ * flags to zero.  No other field of the new ntfs inode is set up here.
+ *
+ * Return the new ntfs inode on success, or NULL (after logging an error) if
+ * the allocation failed.
+ */
+static inline ntfs_inode *ntfs_alloc_extent_inode(void)
+{
+        ntfs_inode *ni;
+        ntfs_debug("Entering.");
+        ni = kmem_cache_alloc(ntfs_inode_cache, SLAB_NOFS);
+        if (unlikely(ni == NULL)) {
+                ntfs_error(NULL, "Allocation of NTFS inode structure failed.");
+                return NULL;
+        }
+        ni->state = 0;
+        return ni;
+}
+/**
+ * ntfs_destroy_extent_inode - give an ntfs inode back to the ntfs inode cache
+ * @ni:         ntfs inode to free
+ *
+ * It is a bug to get here while @ni still has ->page set or while dropping
+ * one reference does not bring ->count down to zero.
+ */
+static void ntfs_destroy_extent_inode(ntfs_inode *ni)
+{
+        ntfs_debug("Entering.");
+        BUG_ON(ni->page != NULL);
+        /* We must be dropping the very last reference. */
+        if (!atomic_dec_and_test(&ni->count))
+                BUG();
+        kmem_cache_free(ntfs_inode_cache, ni);
+}
+/**
+ * __ntfs_init_inode - set the ntfs specific part of an inode to defaults
+ * @sb:         super block of the mounted volume
+ * @ni:         freshly allocated ntfs inode to initialize
+ *
+ * Reset the sizes, sequence number, runlists, attribute list, index and
+ * extent fields of @ni, set its reference count to one, initialize its two
+ * locks and bind it to the ntfs volume of @sb.
+ *
+ * NOTE: ni->mft_no, ni->state, ni->type, ni->name, and ni->name_len are left
+ * untouched.  Make sure to initialize them elsewhere.
+ *
+ * This function cannot fail and does not return a value.
+ */
+void __ntfs_init_inode(struct super_block *sb, ntfs_inode *ni)
+{
+        ntfs_debug("Entering.");
+        /* Identity, owning volume and the initial reference. */
+        ni->vol = NTFS_SB(sb);
+        ni->seq_no = 0;
+        atomic_set(&ni->count, 1);
+        /* Sizes and the (still empty) runlist of the attribute. */
+        ni->allocated_size = 0;
+        ni->initialized_size = 0;
+        ntfs_init_runlist(&ni->runlist);
+        /* Mft record mapping state. */
+        init_MUTEX(&ni->mrec_lock);
+        ni->page_ofs = 0;
+        ni->page = NULL;
+        /* No attribute list yet. */
+        ni->attr_list = NULL;
+        ni->attr_list_size = 0;
+        ntfs_init_runlist(&ni->attr_list_rl);
+        /* Index specific state. */
+        ni->itype.index.bmp_ino = NULL;
+        ni->itype.index.collation_rule = 0;
+        ni->itype.index.block_size = 0;
+        ni->itype.index.block_size_bits = 0;
+        ni->itype.index.vcn_size = 0;
+        ni->itype.index.vcn_size_bits = 0;
+        /* No extent inodes attached and no base inode set. */
+        init_MUTEX(&ni->extent_lock);
+        ni->ext.base_ntfs_ino = NULL;
+        ni->nr_extents = 0;
+}
+/**
+ * ntfs_new_extent_inode - allocate and initialize a new ntfs inode
+ * @sb:         super block of the mounted volume
+ * @mft_no:     mft record number to store in the new ntfs inode
+ *
+ * Allocate an ntfs inode with ntfs_alloc_extent_inode(), set it to defaults
+ * with __ntfs_init_inode() and then fill in the fields that function leaves
+ * alone: ->mft_no is set to @mft_no, ->type to AT_UNUSED and the name is
+ * cleared.
+ *
+ * Return the new ntfs inode or NULL if the allocation failed.
+ */
+inline ntfs_inode *ntfs_new_extent_inode(struct super_block *sb,
+                unsigned long mft_no)
+{
+        ntfs_inode *ni;
+        ni = ntfs_alloc_extent_inode();
+        ntfs_debug("Entering.");
+        if (unlikely(ni == NULL))
+                return NULL;
+        __ntfs_init_inode(sb, ni);
+        ni->name_len = 0;
+        ni->name = NULL;
+        ni->type = AT_UNUSED;
+        ni->mft_no = mft_no;
+        return ni;
+}
+/**
+ * ntfs_is_extended_system_file - check if a file is in the $Extend directory
+ * @ctx:        initialized attribute search context
+ *
+ * Search all file name attributes in the inode described by the attribute
+ * search context @ctx and check if any of the names are in the $Extend system
+ * directory.
+ *
+ * The search context is reinitialized first, so any previous search position
+ * in @ctx is lost.  Each file name attribute found is sanity checked before
+ * its parent directory reference is looked at, because we are called on
+ * inodes that we suspect might be corrupt.
+ *
+ * Return values:
+ *         1: file is in $Extend directory
+ *         0: file is not in $Extend directory
+ *    -errno: failed to determine if the file is in the $Extend directory.
+ *            This is -EIO if a file name attribute is corrupt or the number
+ *            of file name attributes does not match the hard link count, and
+ *            otherwise the error returned by ntfs_attr_lookup().
+ */
+static int ntfs_is_extended_system_file(ntfs_attr_search_ctx *ctx)
+{
+        int nr_links, err;
+        /* Restart search. */
+        ntfs_attr_reinit_search_ctx(ctx);
+        /* Get number of hard links. */
+        nr_links = le16_to_cpu(ctx->mrec->link_count);
+        /* Loop through all hard links. */
+        while (!(err = ntfs_attr_lookup(AT_FILE_NAME, NULL, 0, 0, 0, NULL, 0,
+                        ctx))) {
+                FILE_NAME_ATTR *file_name_attr;
+                ATTR_RECORD *attr = ctx->attr;
+                u8 *p, *p2;
+                nr_links--;
+                /*
+                 * Maximum sanity checking as we are called on an inode that
+                 * we suspect might be corrupt.
+                 */
+                /* The attribute record must end inside the used mft record. */
+                p = (u8*)attr + le32_to_cpu(attr->length);
+                if (p < (u8*)ctx->mrec || (u8*)p > (u8*)ctx->mrec +
+                                le32_to_cpu(ctx->mrec->bytes_in_use))
+                        goto err_corrupt_attr;
+                if (attr->non_resident) {
+                        ntfs_error(ctx->ntfs_ino->vol->sb, "Non-resident file "
+                                        "name. You should run chkdsk.");
+                        return -EIO;
+                }
+                if (attr->flags) {
+                        ntfs_error(ctx->ntfs_ino->vol->sb, "File name with "
+                                        "invalid flags. You should run "
+                                        "chkdsk.");
+                        return -EIO;
+                }
+                if (!(attr->data.resident.flags & RESIDENT_ATTR_IS_INDEXED)) {
+                        ntfs_error(ctx->ntfs_ino->vol->sb, "Unindexed file "
+                                        "name. You should run chkdsk.");
+                        return -EIO;
+                }
+                file_name_attr = (FILE_NAME_ATTR*)((u8*)attr +
+                                le16_to_cpu(attr->data.resident.value_offset));
+                /*
+                 * The value starts at @file_name_attr, not at @attr, so its
+                 * end has to be computed from there.  Otherwise value_offset
+                 * is never validated and the value could extend beyond the
+                 * end of the attribute record.
+                 */
+                p2 = (u8*)file_name_attr +
+                                le32_to_cpu(attr->data.resident.value_length);
+                if (p2 < (u8*)file_name_attr || p2 > p)
+                        goto err_corrupt_attr;
+                /* The field we are about to read must be inside the value. */
+                if ((u8*)&file_name_attr->parent_directory +
+                                sizeof(file_name_attr->parent_directory) > p2)
+                        goto err_corrupt_attr;
+                /* This attribute is ok, but is it in the $Extend directory? */
+                if (MREF_LE(file_name_attr->parent_directory) == FILE_Extend)
+                        return 1;       /* YES, it's an extended system file. */
+        }
+        /* Running out of file name attributes is the normal way to finish. */
+        if (unlikely(err != -ENOENT))
+                return err;
+        if (unlikely(nr_links)) {
+                ntfs_error(ctx->ntfs_ino->vol->sb, "Inode hard link count "
+                                "doesn't match number of name attributes. You "
+                                "should run chkdsk.");
+                return -EIO;
+        }
+        return 0;       /* NO, it is not an extended system file. */
+err_corrupt_attr:
+        ntfs_error(ctx->ntfs_ino->vol->sb, "Corrupt file name "
+                        "attribute. You should run chkdsk.");
+        return -EIO;
+}
+/**
+ * ntfs_read_locked_inode - read an inode from its device
+ * @vi:         inode to read
+ *
+ * ntfs_read_locked_inode() is called from ntfs_iget() to read the inode
+ * described by @vi into memory from the device.
+ *
+ * The only fields in @vi that we need to/can look at when the function is
+ * called are i_sb, pointing to the mounted device's super block, and i_ino,
+ * the number of the inode to load.
+ *
+ * ntfs_read_locked_inode() maps, pins and locks the mft record number i_ino
+ * for reading and sets up the necessary @vi fields as well as initializing
+ * the ntfs inode.
+ *
+ * Q: What locks are held when the function is called?
+ * A: i_state has I_LOCK set, hence the inode is locked, also
+ *    i_count is set to 1, so it is not going to go away
+ *    i_flags is set to 0 and we have no business touching it.  Only an ioctl()
+ *    is allowed to write to them. We should of course be honouring them but
+ *    we need to do that using the IS_* macros defined in include/linux/fs.h.
+ *    In any case ntfs_read_locked_inode() has nothing to do with i_flags.
+ *
+ * Return 0 on success and -errno on error.  In the error case, the inode will
+ * have had make_bad_inode() executed on it.
+ */
+static int ntfs_read_locked_inode(struct inode *vi)
+{
+        ntfs_volume *vol = NTFS_SB(vi->i_sb);
+        ntfs_inode *ni;
+        MFT_RECORD *m;
+        STANDARD_INFORMATION *si;
+        ntfs_attr_search_ctx *ctx;
+        int err = 0;
+        ntfs_debug("Entering for i_ino 0x%lx.", vi->i_ino);
+        /* Setup the generic vfs inode parts now. */
+        /* This is the optimal IO size (for stat), not the fs block size. */
+        vi->i_blksize = PAGE_CACHE_SIZE;
+        /*
+         * This is for checking whether an inode has changed w.r.t. a file so
+         * that the file can be updated if necessary (compare with f_version).
+         */
+        vi->i_version = 1;
+        vi->i_uid = vol->uid;
+        vi->i_gid = vol->gid;
+        vi->i_mode = 0;
+        /*
+         * Initialize the ntfs specific part of @vi special casing
+         * FILE_MFT which we need to do at mount time.
+         */
+        if (vi->i_ino != FILE_MFT)
+                ntfs_init_big_inode(vi);
+        ni = NTFS_I(vi);
+        m = map_mft_record(ni);
+        if (IS_ERR(m)) {
+                err = PTR_ERR(m);
+                goto err_out;
+        }
+        ctx = ntfs_attr_get_search_ctx(ni, m);
+        if (!ctx) {
+                err = -ENOMEM;
+                goto unm_err_out;
+        }
+        if (!(m->flags & MFT_RECORD_IN_USE)) {
+                ntfs_error(vi->i_sb, "Inode is not in use!");
+                goto unm_err_out;
+        }
+        if (m->base_mft_record) {
+                ntfs_error(vi->i_sb, "Inode is an extent inode!");
+                goto unm_err_out;
+        }
+        /* Transfer information from mft record into vfs and ntfs inodes. */
+        vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number);
+        /*
+         * FIXME: Keep in mind that link_count is two for files which have both
+         * a long file name and a short file name as separate entries, so if
+         * we are hiding short file names this will be too high. Either we need
+         * to account for the short file names by subtracting them or we need
+         * to make sure we delete files even though i_nlink is not zero which
+         * might be tricky due to vfs interactions. Need to think about this
+         * some more when implementing the unlink command.
+         */
+        vi->i_nlink = le16_to_cpu(m->link_count);
+        /*
+         * FIXME: Reparse points can have the directory bit set even though
+         * they would be S_IFLNK. Need to deal with this further below when we
+         * implement reparse points / symbolic links but it will do for now.
+         * Also if not a directory, it could be something else, rather than
+         * a regular file. But again, will do for now.
+         */
+        /* Everyone gets all permissions. */
+        vi->i_mode |= S_IRWXUGO;
+        /* If read-only, noone gets write permissions. */
+        if (IS_RDONLY(vi))
+                vi->i_mode &= ~S_IWUGO;
+        if (m->flags & MFT_RECORD_IS_DIRECTORY) {
+                vi->i_mode |= S_IFDIR;
+                /*
+                 * Apply the directory permissions mask set in the mount
+                 * options.
+                 */
+                vi->i_mode &= ~vol->dmask;
+                /* Things break without this kludge! */
+                if (vi->i_nlink > 1)
+                        vi->i_nlink = 1;
+        } else {
+                vi->i_mode |= S_IFREG;
+                /* Apply the file permissions mask set in the mount options. */
+                vi->i_mode &= ~vol->fmask;
+        }
+        /*
+         * Find the standard information attribute in the mft record. At this
+         * stage we haven't setup the attribute list stuff yet, so this could
+         * in fact fail if the standard information is in an extent record, but
+         * I don't think this actually ever happens.
+         */
+        err = ntfs_attr_lookup(AT_STANDARD_INFORMATION, NULL, 0, 0, 0, NULL, 0,
+                        ctx);
+        if (unlikely(err)) {
+                if (err == -ENOENT) {
+                        /*
+                         * TODO: We should be performing a hot fix here (if the
+                         * recover mount option is set) by creating a new
+                         * attribute.
+                         */
+                        ntfs_error(vi->i_sb, "$STANDARD_INFORMATION attribute "
+                                        "is missing.");
+                }
+                goto unm_err_out;
+        }
+        /* Get the standard information attribute value. */
+        si = (STANDARD_INFORMATION*)((char*)ctx->attr +
+                        le16_to_cpu(ctx->attr->data.resident.value_offset));
+        /* Transfer information from the standard information into vi. */
+        /*
+         * Note: The i_?times do not quite map perfectly onto the NTFS times,
+         * but they are close enough, and in the end it doesn't really matter
+         * that much...
+         */
+        /*
+         * mtime is the last change of the data within the file. Not changed
+         * when only metadata is changed, e.g. a rename doesn't affect mtime.
+         */
+        vi->i_mtime = ntfs2utc(si->last_data_change_time);
+        /*
+         * ctime is the last change of the metadata of the file. This obviously
+         * always changes, when mtime is changed. ctime can be changed on its
+         * own, mtime is then not changed, e.g. when a file is renamed.
+         */
+        vi->i_ctime = ntfs2utc(si->last_mft_change_time);
+        /*
+         * Last access to the data within the file. Not changed during a rename
+         * for example but changed whenever the file is written to.
+         */
+        vi->i_atime = ntfs2utc(si->last_access_time);
+        /* Find the attribute list attribute if present. */
+        ntfs_attr_reinit_search_ctx(ctx);
+        err = ntfs_attr_lookup(AT_ATTRIBUTE_LIST, NULL, 0, 0, 0, NULL, 0, ctx);
+        if (err) {
+                if (unlikely(err != -ENOENT)) {
+                        ntfs_error(vi->i_sb, "Failed to lookup attribute list "
+                                        "attribute.");
+                        goto unm_err_out;
+                }
+        } else /* if (!err) */ {
+                if (vi->i_ino == FILE_MFT)
+                        goto skip_attr_list_load;
+                ntfs_debug("Attribute list found in inode 0x%lx.", vi->i_ino);
+                NInoSetAttrList(ni);
+                if (ctx->attr->flags & ATTR_IS_ENCRYPTED ||
+                                ctx->attr->flags & ATTR_COMPRESSION_MASK ||
+                                ctx->attr->flags & ATTR_IS_SPARSE) {
+                        ntfs_error(vi->i_sb, "Attribute list attribute is "
+                                        "compressed/encrypted/sparse.");
+                        goto unm_err_out;
+                }
+                /* Now allocate memory for the attribute list. */
+                ni->attr_list_size = (u32)ntfs_attr_size(ctx->attr);
+                ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size);
+                if (!ni->attr_list) {
+                        ntfs_error(vi->i_sb, "Not enough memory to allocate "
+                                        "buffer for attribute list.");
+                        err = -ENOMEM;
+                        goto unm_err_out;
+                }
+                if (ctx->attr->non_resident) {
+                        NInoSetAttrListNonResident(ni);
+                        if (ctx->attr->data.non_resident.lowest_vcn) {
+                                ntfs_error(vi->i_sb, "Attribute list has non "
+                                                "zero lowest_vcn.");
+                                goto unm_err_out;
+                        }
+                        /*
+                         * Setup the runlist. No need for locking as we have
+                         * exclusive access to the inode at this time.
+                         */
+                        ni->attr_list_rl.rl = ntfs_mapping_pairs_decompress(vol,
+                                        ctx->attr, NULL);
+                        if (IS_ERR(ni->attr_list_rl.rl)) {
+                                err = PTR_ERR(ni->attr_list_rl.rl);
+                                ni->attr_list_rl.rl = NULL;
+                                ntfs_error(vi->i_sb, "Mapping pairs "
+                                                "decompression failed.");
+                                goto unm_err_out;
+                        }
+                        /* Now load the attribute list. */
+                        if ((err = load_attribute_list(vol, &ni->attr_list_rl,
+                                        ni->attr_list, ni->attr_list_size,
+                                        sle64_to_cpu(ctx->attr->data.
+                                        non_resident.initialized_size)))) {
+                                ntfs_error(vi->i_sb, "Failed to load "
+                                                "attribute list attribute.");
+                                goto unm_err_out;
+                        }
+                } else /* if (!ctx.attr->non_resident) */ {
+                        if ((u8*)ctx->attr + le16_to_cpu(
+                                        ctx->attr->data.resident.value_offset) +
+                                        le32_to_cpu(
+                                        ctx->attr->data.resident.value_length) >
+                                        (u8*)ctx->mrec + vol->mft_record_size) {
+                                ntfs_error(vi->i_sb, "Corrupt attribute list "
+                                                "in inode.");
+                                goto unm_err_out;
+                        }
+                        /* Now copy the attribute list. */
+                        memcpy(ni->attr_list, (u8*)ctx->attr + le16_to_cpu(
+                                        ctx->attr->data.resident.value_offset),
+                                        le32_to_cpu(
+                                        ctx->attr->data.resident.value_length));
+                }
+        }
+skip_attr_list_load:
+        /*
+         * If an attribute list is present we now have the attribute list value
+         * in ntfs_ino->attr_list and it is ntfs_ino->attr_list_size bytes.
+         */
+        if (S_ISDIR(vi->i_mode)) {
+                struct inode *bvi;
+                ntfs_inode *bni;
+                INDEX_ROOT *ir;
+                char *ir_end, *index_end;
+                /* It is a directory, find index root attribute. */
+                ntfs_attr_reinit_search_ctx(ctx);
+                err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE,
+                                0, NULL, 0, ctx);
+                if (unlikely(err)) {
+                        if (err == -ENOENT) {
+                                // FIXME: File is corrupt! Hot-fix with empty
+                                // index root attribute if recovery option is
+                                // set.
+                                ntfs_error(vi->i_sb, "$INDEX_ROOT attribute "
+                                                "is missing.");
+                        }
+                        goto unm_err_out;
+                }
+                /* Set up the state. */
+                if (unlikely(ctx->attr->non_resident)) {
+                        ntfs_error(vol->sb, "$INDEX_ROOT attribute is not "
+                                        "resident.");
+                        goto unm_err_out;
+                }
+                /* Ensure the attribute name is placed before the value. */
+                if (unlikely(ctx->attr->name_length &&
+                                (le16_to_cpu(ctx->attr->name_offset) >=
+                                le16_to_cpu(ctx->attr->data.resident.
+                                value_offset)))) {
+                        ntfs_error(vol->sb, "$INDEX_ROOT attribute name is "
+                                        "placed after the attribute value.");
+                        goto unm_err_out;
+                }
+                /*
+                 * Compressed/encrypted index root just means that the newly
+                 * created files in that directory should be created compressed/
+                 * encrypted. However index root cannot be both compressed and
+                 * encrypted.
+                 */
+                if (ctx->attr->flags & ATTR_COMPRESSION_MASK)
+                        NInoSetCompressed(ni);
+                if (ctx->attr->flags & ATTR_IS_ENCRYPTED) {
+                        if (ctx->attr->flags & ATTR_COMPRESSION_MASK) {
+                                ntfs_error(vi->i_sb, "Found encrypted and "
+                                                "compressed attribute.");
+                                goto unm_err_out;
+                        }
+                        NInoSetEncrypted(ni);
+                }
+                if (ctx->attr->flags & ATTR_IS_SPARSE)
+                        NInoSetSparse(ni);
+                ir = (INDEX_ROOT*)((char*)ctx->attr + le16_to_cpu(
+                                ctx->attr->data.resident.value_offset));
+                ir_end = (char*)ir + le32_to_cpu(
+                                ctx->attr->data.resident.value_length);
+                if (ir_end > (char*)ctx->mrec + vol->mft_record_size) {
+                        ntfs_error(vi->i_sb, "$INDEX_ROOT attribute is "
+                                        "corrupt.");
+                        goto unm_err_out;
+                }
+                index_end = (char*)&ir->index +
+                                le32_to_cpu(ir->index.index_length);
+                if (index_end > ir_end) {
+                        ntfs_error(vi->i_sb, "Directory index is corrupt.");
+                        goto unm_err_out;
+                }
+                if (ir->type != AT_FILE_NAME) {
+                        ntfs_error(vi->i_sb, "Indexed attribute is not "
+                                        "$FILE_NAME.");
+                        goto unm_err_out;
+                }
+                if (ir->collation_rule != COLLATION_FILE_NAME) {
+                        ntfs_error(vi->i_sb, "Index collation rule is not "
+                                        "COLLATION_FILE_NAME.");
+                        goto unm_err_out;
+                }
+                ni->itype.index.collation_rule = ir->collation_rule;
+                ni->itype.index.block_size = le32_to_cpu(ir->index_block_size);
+                if (ni->itype.index.block_size &
+                                (ni->itype.index.block_size - 1)) {
+                        ntfs_error(vi->i_sb, "Index block size (%u) is not a "
+                                        "power of two.",
+                                        ni->itype.index.block_size);
+                        goto unm_err_out;
+                }
+                if (ni->itype.index.block_size > PAGE_CACHE_SIZE) {
+                        ntfs_error(vi->i_sb, "Index block size (%u) > "
+                                        "PAGE_CACHE_SIZE (%ld) is not "
+                                        "supported.  Sorry.",
+                                        ni->itype.index.block_size,
+                                        PAGE_CACHE_SIZE);
+                        err = -EOPNOTSUPP;
+                        goto unm_err_out;
+                }
+                if (ni->itype.index.block_size < NTFS_BLOCK_SIZE) {
+                        ntfs_error(vi->i_sb, "Index block size (%u) < "
+                                        "NTFS_BLOCK_SIZE (%i) is not "
+                                        "supported.  Sorry.",
+                                        ni->itype.index.block_size,
+                                        NTFS_BLOCK_SIZE);
+                        err = -EOPNOTSUPP;
+                        goto unm_err_out;
+                }
+                ni->itype.index.block_size_bits =
+                                ffs(ni->itype.index.block_size) - 1;
+                /* Determine the size of a vcn in the directory index. */
+                if (vol->cluster_size <= ni->itype.index.block_size) {
+                        ni->itype.index.vcn_size = vol->cluster_size;
+                        ni->itype.index.vcn_size_bits = vol->cluster_size_bits;
+                } else {
+                        ni->itype.index.vcn_size = vol->sector_size;
+                        ni->itype.index.vcn_size_bits = vol->sector_size_bits;
+                }
+                /* Setup the index allocation attribute, even if not present. */
+                NInoSetMstProtected(ni);
+                ni->type = AT_INDEX_ALLOCATION;
+                ni->name = I30;
+                ni->name_len = 4;
+                if (!(ir->index.flags & LARGE_INDEX)) {
+                        /* No index allocation. */
+                        vi->i_size = ni->initialized_size =
+                                        ni->allocated_size = 0;
+                        /* We are done with the mft record, so we release it. */
+                        ntfs_attr_put_search_ctx(ctx);
+                        unmap_mft_record(ni);
+                        m = NULL;
+                        ctx = NULL;
+                        goto skip_large_dir_stuff;
+                } /* LARGE_INDEX: Index allocation present. Setup state. */
+                NInoSetIndexAllocPresent(ni);
+                /* Find index allocation attribute. */
+                ntfs_attr_reinit_search_ctx(ctx);
+                err = ntfs_attr_lookup(AT_INDEX_ALLOCATION, I30, 4,
+                                CASE_SENSITIVE, 0, NULL, 0, ctx);
+                if (unlikely(err)) {
+                        if (err == -ENOENT)
+                                ntfs_error(vi->i_sb, "$INDEX_ALLOCATION "
+                                                "attribute is not present but "
+                                                "$INDEX_ROOT indicated it is.");
+                        else
+                                ntfs_error(vi->i_sb, "Failed to lookup "
+                                                "$INDEX_ALLOCATION "
+                                                "attribute.");
+                        goto unm_err_out;
+                }
+                if (!ctx->attr->non_resident) {
+                        ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute "
+                                        "is resident.");
+                        goto unm_err_out;
+                }
+                /*
+                 * Ensure the attribute name is placed before the mapping pairs
+                 * array.
+                 */
+                if (unlikely(ctx->attr->name_length &&
+                                (le16_to_cpu(ctx->attr->name_offset) >=
+                                le16_to_cpu(ctx->attr->data.non_resident.
+                                mapping_pairs_offset)))) {
+                        ntfs_error(vol->sb, "$INDEX_ALLOCATION attribute name "
+                                        "is placed after the mapping pairs "
+                                        "array.");
+                        goto unm_err_out;
+                }
+                if (ctx->attr->flags & ATTR_IS_ENCRYPTED) {
+                        ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute "
+                                        "is encrypted.");
+                        goto unm_err_out;
+                }
+                if (ctx->attr->flags & ATTR_IS_SPARSE) {
+                        ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute "
+                                        "is sparse.");
+                        goto unm_err_out;
+                }
+                if (ctx->attr->flags & ATTR_COMPRESSION_MASK) {
+                        ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute "
+                                        "is compressed.");
+                        goto unm_err_out;
+                }
+                if (ctx->attr->data.non_resident.lowest_vcn) {
+                        ntfs_error(vi->i_sb, "First extent of "
+                                        "$INDEX_ALLOCATION attribute has non "
+                                        "zero lowest_vcn.");
+                        goto unm_err_out;
+                }
+                vi->i_size = sle64_to_cpu(
+                                ctx->attr->data.non_resident.data_size);
+                ni->initialized_size = sle64_to_cpu(
+                                ctx->attr->data.non_resident.initialized_size);
+                ni->allocated_size = sle64_to_cpu(
+                                ctx->attr->data.non_resident.allocated_size);
+                /*
+                 * We are done with the mft record, so we release it. Otherwise
+                 * we would deadlock in ntfs_attr_iget().
+                 */
+                ntfs_attr_put_search_ctx(ctx);
+                unmap_mft_record(ni);
+                m = NULL;
+                ctx = NULL;
+                /* Get the index bitmap attribute inode. */
+                bvi = ntfs_attr_iget(vi, AT_BITMAP, I30, 4);
+                if (IS_ERR(bvi)) {
+                        ntfs_error(vi->i_sb, "Failed to get bitmap attribute.");
+                        err = PTR_ERR(bvi);
+                        goto unm_err_out;
+                }
+                ni->itype.index.bmp_ino = bvi;
+                bni = NTFS_I(bvi);
+                if (NInoCompressed(bni) || NInoEncrypted(bni) ||
+                                NInoSparse(bni)) {
+                        ntfs_error(vi->i_sb, "$BITMAP attribute is compressed "
+                                        "and/or encrypted and/or sparse.");
+                        goto unm_err_out;
+                }
+                /* Consistency check bitmap size vs. index allocation size. */
+                if ((bvi->i_size << 3) < (vi->i_size >>
+                                ni->itype.index.block_size_bits)) {
+                        ntfs_error(vi->i_sb, "Index bitmap too small (0x%llx) "
+                                        "for index allocation (0x%llx).",
+                                        bvi->i_size << 3, vi->i_size);
+                        goto unm_err_out;
+                }
+skip_large_dir_stuff:
+                /* Setup the operations for this inode. */
+                vi->i_op = &ntfs_dir_inode_ops;
+                vi->i_fop = &ntfs_dir_ops;
+        } else {
+                /* It is a file. */
+                ntfs_attr_reinit_search_ctx(ctx);
+                /* Setup the data attribute, even if not present. */
+                ni->type = AT_DATA;
+                ni->name = NULL;
+                ni->name_len = 0;
+                /* Find first extent of the unnamed data attribute. */
+                err = ntfs_attr_lookup(AT_DATA, NULL, 0, 0, 0, NULL, 0, ctx);
+                if (unlikely(err)) {
+                        vi->i_size = ni->initialized_size =
+                                        ni->allocated_size = 0;
+                        if (err != -ENOENT) {
+                                ntfs_error(vi->i_sb, "Failed to lookup $DATA "
+                                                "attribute.");
+                                goto unm_err_out;
+                        }
+                        /*
+                         * FILE_Secure does not have an unnamed $DATA
+                         * attribute, so we special case it here.
+                         */
+                        if (vi->i_ino == FILE_Secure)
+                                goto no_data_attr_special_case;
+                        /*
+                         * Most if not all the system files in the $Extend
+                         * system directory do not have unnamed data
+                         * attributes so we need to check if the parent
+                         * directory of the file is FILE_Extend and if it is
+                         * ignore this error. To do this we need to get the
+                         * name of this inode from the mft record as the name
+                         * contains the back reference to the parent directory.
+                         */
+                        if (ntfs_is_extended_system_file(ctx) > 0)
+                                goto no_data_attr_special_case;
+                        // FIXME: File is corrupt! Hot-fix with empty data
+                        // attribute if recovery option is set.
+                        ntfs_error(vi->i_sb, "$DATA attribute is missing.");
+                        goto unm_err_out;
+                }
+                /* Setup the state. */
+                if (ctx->attr->non_resident) {
+                        NInoSetNonResident(ni);
+                        if (ctx->attr->flags & ATTR_COMPRESSION_MASK) {
+                                NInoSetCompressed(ni);
+                                if (vol->cluster_size > 4096) {
+                                        ntfs_error(vi->i_sb, "Found "
+                                                "compressed data but "
+                                                "compression is disabled due "
+                                                "to cluster size (%i) > 4kiB.",
+                                                vol->cluster_size);
+                                        goto unm_err_out;
+                                }
+                                if ((ctx->attr->flags & ATTR_COMPRESSION_MASK)
+                                                != ATTR_IS_COMPRESSED) {
+                                        ntfs_error(vi->i_sb, "Found "
+                                                "unknown compression method or "
+                                                "corrupt file.");
+                                        goto unm_err_out;
+                                }
+                                ni->itype.compressed.block_clusters = 1U <<
+                                                ctx->attr->data.non_resident.
+                                                compression_unit;
+                                if (ctx->attr->data.non_resident.
+                                                compression_unit != 4) {
+                                        ntfs_error(vi->i_sb, "Found "
+                                                "nonstandard compression unit "
+                                                "(%u instead of 4).  Cannot "
+                                                "handle this.",
+                                                ctx->attr->data.non_resident.
+                                                compression_unit);
+                                        err = -EOPNOTSUPP;
+                                        goto unm_err_out;
+                                }
+                                ni->itype.compressed.block_size = 1U << (
+                                                ctx->attr->data.non_resident.
+                                                compression_unit +
+                                                vol->cluster_size_bits);
+                                ni->itype.compressed.block_size_bits = ffs(
+                                        ni->itype.compressed.block_size) - 1;
+                        }
+                        if (ctx->attr->flags & ATTR_IS_ENCRYPTED) {
+                                if (ctx->attr->flags & ATTR_COMPRESSION_MASK) {
+                                        ntfs_error(vi->i_sb, "Found encrypted "
+                                                        "and compressed data.");
+                                        goto unm_err_out;
+                                }
+                                NInoSetEncrypted(ni);
+                        }
+                        if (ctx->attr->flags & ATTR_IS_SPARSE)
+                                NInoSetSparse(ni);
+                        if (ctx->attr->data.non_resident.lowest_vcn) {
+                                ntfs_error(vi->i_sb, "First extent of $DATA "
+                                                "attribute has non zero "
+                                                "lowest_vcn.");
+                                goto unm_err_out;
+                        }
+                        /* Setup all the sizes. */
+                        vi->i_size = sle64_to_cpu(
+                                        ctx->attr->data.non_resident.data_size);
+                        ni->initialized_size = sle64_to_cpu(
+                                        ctx->attr->data.non_resident.
+                                        initialized_size);
+                        ni->allocated_size = sle64_to_cpu(
+                                        ctx->attr->data.non_resident.
+                                        allocated_size);
+                        if (NInoCompressed(ni)) {
+                                ni->itype.compressed.size = sle64_to_cpu(
+                                                ctx->attr->data.non_resident.
+                                                compressed_size);
+                        }
+                } else { /* Resident attribute. */
+                        /*
+                         * Make all sizes equal for simplicity in read code
+                         * paths. FIXME: Need to keep this in mind when
+                         * converting to non-resident attribute in write code
+                         * path. (Probably only affects truncate().)
+                         */
+                        vi->i_size = ni->initialized_size = ni->allocated_size =
+                                        le32_to_cpu(
+                                        ctx->attr->data.resident.value_length);
+                }
+no_data_attr_special_case:
+                /* We are done with the mft record, so we release it. */
+                ntfs_attr_put_search_ctx(ctx);
+                unmap_mft_record(ni);
+                m = NULL;
+                ctx = NULL;
+                /* Setup the operations for this inode. */
+                vi->i_op = &ntfs_file_inode_ops;
+                vi->i_fop = &ntfs_file_ops;
+        }
+        if (NInoMstProtected(ni))
+                vi->i_mapping->a_ops = &ntfs_mst_aops;
+        else
+                vi->i_mapping->a_ops = &ntfs_aops;
+        /*
+         * The number of 512-byte blocks used on disk (for stat). This is in so
+         * far inaccurate as it doesn't account for any named streams or other
+         * special non-resident attributes, but that is how Windows works, too,
+         * so we are at least consistent with Windows, if not entirely
+         * consistent with the Linux Way. Doing it the Linux Way would cause a
+         * significant slowdown as it would involve iterating over all
+         * attributes in the mft record and adding the allocated/compressed
+         * sizes of all non-resident attributes present to give us the Linux
+         * correct size that should go into i_blocks (after division by 512).
+         */
+        if (S_ISDIR(vi->i_mode) || !NInoCompressed(ni))
+                vi->i_blocks = ni->allocated_size >> 9;
+        else
+                vi->i_blocks = ni->itype.compressed.size >> 9;
+        ntfs_debug("Done.");
+        return 0;
+unm_err_out:
+        if (!err)
+                err = -EIO;
+        if (ctx)
+                ntfs_attr_put_search_ctx(ctx);
+        if (m)
+                unmap_mft_record(ni);
+err_out:
+        ntfs_error(vol->sb, "Failed with error code %i.  Marking corrupt "
+                        "inode 0x%lx as bad.  Run chkdsk.", err, vi->i_ino);
+        make_bad_inode(vi);
+        if (err != -EOPNOTSUPP && err != -ENOMEM)
+                NVolSetErrors(vol);
+        return err;
+}
+/**
+ * ntfs_read_locked_attr_inode - read an attribute inode from its base inode
+ * @base_vi:    base inode
+ * @vi:         attribute inode to read
+ *
+ * ntfs_read_locked_attr_inode() is called from ntfs_attr_iget() to read the
+ * attribute inode described by @vi into memory from the base mft record
+ * described by @base_vi.
+ *
+ * ntfs_read_locked_attr_inode() maps, pins and locks the base inode for
+ * reading and looks up the attribute described by @vi (using the type, name
+ * and name length already stored in the ntfs inode of @vi) before setting up
+ * the necessary fields in @vi as well as initializing the ntfs inode.
+ *
+ * On success a reference on @base_vi is taken with igrab() and the base ntfs
+ * inode is attached to the attribute inode.
+ *
+ * Q: What locks are held when the function is called?
+ * A: i_state has I_LOCK set, hence the inode is locked, also
+ *    i_count is set to 1, so it is not going to go away
+ *
+ * Return 0 on success and -errno on error.  In the error case, both @vi and
+ * @base_vi will have had make_bad_inode() executed on them and, unless the
+ * error is -ENOMEM or -EOPNOTSUPP, the volume is flagged as having errors.
+ */
+static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
+{
+        ntfs_volume *vol = NTFS_SB(vi->i_sb);
+        ntfs_inode *ni, *base_ni;
+        MFT_RECORD *m;
+        ntfs_attr_search_ctx *ctx;
+        int err = 0;
+        ntfs_debug("Entering for i_ino 0x%lx.", vi->i_ino);
+        ntfs_init_big_inode(vi);
+        ni      = NTFS_I(vi);
+        base_ni = NTFS_I(base_vi);
+        /* Just mirror the values from the base inode. */
+        vi->i_blksize   = base_vi->i_blksize;
+        vi->i_version   = base_vi->i_version;
+        vi->i_uid       = base_vi->i_uid;
+        vi->i_gid       = base_vi->i_gid;
+        vi->i_nlink     = base_vi->i_nlink;
+        vi->i_mtime     = base_vi->i_mtime;
+        vi->i_ctime     = base_vi->i_ctime;
+        vi->i_atime     = base_vi->i_atime;
+        vi->i_generation = ni->seq_no = base_ni->seq_no;
+        /* Set inode type to zero but preserve permissions. */
+        vi->i_mode      = base_vi->i_mode & ~S_IFMT;
+        /* Map the mft record for the base inode. */
+        m = map_mft_record(base_ni);
+        if (IS_ERR(m)) {
+                err = PTR_ERR(m);
+                /* Nothing is mapped yet, so skip the unmap in unm_err_out. */
+                goto err_out;
+        }
+        ctx = ntfs_attr_get_search_ctx(base_ni, m);
+        if (!ctx) {
+                err = -ENOMEM;
+                goto unm_err_out;
+        }
+        /* Find the attribute. */
+        err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+                        CASE_SENSITIVE, 0, NULL, 0, ctx);
+        if (unlikely(err))
+                goto unm_err_out;
+        /*
+         * From here on, every consistency check that fails jumps to
+         * unm_err_out with err still zero, which is turned into -EIO there.
+         */
+        if (!ctx->attr->non_resident) {
+                /* Ensure the attribute name is placed before the value. */
+                if (unlikely(ctx->attr->name_length &&
+                                (le16_to_cpu(ctx->attr->name_offset) >=
+                                le16_to_cpu(ctx->attr->data.resident.
+                                value_offset)))) {
+                        ntfs_error(vol->sb, "Attribute name is placed after "
+                                        "the attribute value.");
+                        goto unm_err_out;
+                }
+                if (NInoMstProtected(ni) || ctx->attr->flags) {
+                        ntfs_error(vi->i_sb, "Found mst protected attribute "
+                                        "or attribute with non-zero flags but "
+                                        "the attribute is resident.  Please "
+                                        "report you saw this message to "
+                                        "linux-ntfs-dev@lists.sourceforge.net");
+                        goto unm_err_out;
+                }
+                /*
+                 * Resident attribute. Make all sizes equal for simplicity in
+                 * read code paths.
+                 */
+                vi->i_size = ni->initialized_size = ni->allocated_size =
+                        le32_to_cpu(ctx->attr->data.resident.value_length);
+        } else {
+                NInoSetNonResident(ni);
+                /*
+                 * Ensure the attribute name is placed before the mapping pairs
+                 * array.
+                 */
+                if (unlikely(ctx->attr->name_length &&
+                                (le16_to_cpu(ctx->attr->name_offset) >=
+                                le16_to_cpu(ctx->attr->data.non_resident.
+                                mapping_pairs_offset)))) {
+                        ntfs_error(vol->sb, "Attribute name is placed after "
+                                        "the mapping pairs array.");
+                        goto unm_err_out;
+                }
+                if (ctx->attr->flags & ATTR_COMPRESSION_MASK) {
+                        if (NInoMstProtected(ni)) {
+                                ntfs_error(vi->i_sb, "Found mst protected "
+                                                "attribute but the attribute "
+                                                "is compressed.  Please report "
+                                                "you saw this message to "
+                                                "linux-ntfs-dev@lists."
+                                                "sourceforge.net");
+                                goto unm_err_out;
+                        }
+                        NInoSetCompressed(ni);
+                        /* Only the unnamed $DATA attribute may be compressed. */
+                        if (ni->type != AT_DATA || ni->name_len) {
+                                ntfs_error(vi->i_sb, "Found compressed "
+                                                "non-data or named data "
+                                                "attribute.  Please report "
+                                                "you saw this message to "
+                                                "linux-ntfs-dev@lists."
+                                                "sourceforge.net");
+                                goto unm_err_out;
+                        }
+                        if (vol->cluster_size > 4096) {
+                                ntfs_error(vi->i_sb, "Found compressed "
+                                                "attribute but compression is "
+                                                "disabled due to cluster size "
+                                                "(%i) > 4kiB.",
+                                                vol->cluster_size);
+                                goto unm_err_out;
+                        }
+                        if ((ctx->attr->flags & ATTR_COMPRESSION_MASK)
+                                        != ATTR_IS_COMPRESSED) {
+                                ntfs_error(vi->i_sb, "Found unknown "
+                                                "compression method.");
+                                goto unm_err_out;
+                        }
+                        ni->itype.compressed.block_clusters = 1U <<
+                                        ctx->attr->data.non_resident.
+                                        compression_unit;
+                        if (ctx->attr->data.non_resident.compression_unit !=
+                                        4) {
+                                ntfs_error(vi->i_sb, "Found nonstandard "
+                                                "compression unit (%u instead "
+                                                "of 4).  Cannot handle this.",
+                                                ctx->attr->data.non_resident.
+                                                compression_unit);
+                                /* Unsupported rather than corrupt. */
+                                err = -EOPNOTSUPP;
+                                goto unm_err_out;
+                        }
+                        /* Compression block size in bytes and its log2. */
+                        ni->itype.compressed.block_size = 1U << (
+                                        ctx->attr->data.non_resident.
+                                        compression_unit +
+                                        vol->cluster_size_bits);
+                        ni->itype.compressed.block_size_bits = ffs(
+                                ni->itype.compressed.block_size) - 1;
+                }
+                if (ctx->attr->flags & ATTR_IS_ENCRYPTED) {
+                        if (ctx->attr->flags & ATTR_COMPRESSION_MASK) {
+                                ntfs_error(vi->i_sb, "Found encrypted "
+                                                "and compressed data.");
+                                goto unm_err_out;
+                        }
+                        if (NInoMstProtected(ni)) {
+                                ntfs_error(vi->i_sb, "Found mst protected "
+                                                "attribute but the attribute "
+                                                "is encrypted.  Please report "
+                                                "you saw this message to "
+                                                "linux-ntfs-dev@lists."
+                                                "sourceforge.net");
+                                goto unm_err_out;
+                        }
+                        NInoSetEncrypted(ni);
+                }
+                if (ctx->attr->flags & ATTR_IS_SPARSE) {
+                        if (NInoMstProtected(ni)) {
+                                ntfs_error(vi->i_sb, "Found mst protected "
+                                                "attribute but the attribute "
+                                                "is sparse.  Please report "
+                                                "you saw this message to "
+                                                "linux-ntfs-dev@lists."
+                                                "sourceforge.net");
+                                goto unm_err_out;
+                        }
+                        NInoSetSparse(ni);
+                }
+                if (ctx->attr->data.non_resident.lowest_vcn) {
+                        ntfs_error(vi->i_sb, "First extent of attribute has "
+                                        "non-zero lowest_vcn.");
+                        goto unm_err_out;
+                }
+                /* Setup all the sizes. */
+                vi->i_size = sle64_to_cpu(
+                                ctx->attr->data.non_resident.data_size);
+                ni->initialized_size = sle64_to_cpu(
+                                ctx->attr->data.non_resident.initialized_size);
+                ni->allocated_size = sle64_to_cpu(
+                                ctx->attr->data.non_resident.allocated_size);
+                if (NInoCompressed(ni)) {
+                        ni->itype.compressed.size = sle64_to_cpu(
+                                        ctx->attr->data.non_resident.
+                                        compressed_size);
+                }
+        }
+        /* Setup the operations for this attribute inode. */
+        vi->i_op = NULL;
+        vi->i_fop = NULL;
+        if (NInoMstProtected(ni))
+                vi->i_mapping->a_ops = &ntfs_mst_aops;
+        else
+                vi->i_mapping->a_ops = &ntfs_aops;
+        /* Number of 512-byte blocks used on disk (hence the shift by 9). */
+        if (!NInoCompressed(ni))
+                vi->i_blocks = ni->allocated_size >> 9;
+        else
+                vi->i_blocks = ni->itype.compressed.size >> 9;
+        /*
+         * Make sure the base inode doesn't go away and attach it to the
+         * attribute inode.
+         */
+        igrab(base_vi);
+        ni->ext.base_ntfs_ino = base_ni;
+        ni->nr_extents = -1;
+        ntfs_attr_put_search_ctx(ctx);
+        unmap_mft_record(base_ni);
+        ntfs_debug("Done.");
+        return 0;
+unm_err_out:
+        /* Failed consistency checks arrive with err == 0; report them as I/O errors. */
+        if (!err)
+                err = -EIO;
+        if (ctx)
+                ntfs_attr_put_search_ctx(ctx);
+        unmap_mft_record(base_ni);
+err_out:
+        ntfs_error(vol->sb, "Failed with error code %i while reading attribute "
+                        "inode (mft_no 0x%lx, type 0x%x, name_len %i).  "
+                        "Marking corrupt inode and base inode 0x%lx as bad.  "
+                        "Run chkdsk.", err, vi->i_ino, ni->type, ni->name_len,
+                        base_vi->i_ino);
+        make_bad_inode(vi);
+        make_bad_inode(base_vi);
+        /*
+         * An unsupported feature or a memory shortage says nothing about the
+         * on-disk state, so only flag the volume for all other errors.  This
+         * matches the other inode readers in this file.
+         */
+        if (err != -EOPNOTSUPP && err != -ENOMEM)
+                NVolSetErrors(vol);
+        return err;
+}
+/**
+ * ntfs_read_locked_index_inode - read an index inode from its base inode
+ * @base_vi:    base inode
+ * @vi:         index inode to read
+ *
+ * ntfs_read_locked_index_inode() is called from ntfs_index_iget() to read the
+ * index inode described by @vi into memory from the base mft record described
+ * by @base_vi.
+ *
+ * ntfs_read_locked_index_inode() maps, pins and locks the base inode for
+ * reading and looks up the attributes relating to the index described by @vi
+ * before setting up the necessary fields in @vi as well as initializing the
+ * ntfs inode.
+ *
+ * Note, index inodes are essentially attribute inodes (NInoAttr() is true)
+ * with the attribute type set to AT_INDEX_ALLOCATION.  Apart from that, they
+ * are setup like directory inodes since directories are a special case of
+ * indices so they need to be treated in much the same way.  Most importantly,
+ * for small indices the index allocation attribute might not actually exist.
+ * However, the index root attribute always exists but this does not need to
+ * have an inode associated with it and this is why we define a new inode type
+ * index.  Also, like for directories, we need to have an attribute inode for
+ * the bitmap attribute corresponding to the index allocation attribute and we
+ * can store this in the appropriate field of the inode, just like we do for
+ * normal directory inodes.
+ *
+ * On success a reference on @base_vi is taken with igrab() and the base ntfs
+ * inode is attached to the index inode.  For large indices the bitmap
+ * attribute inode obtained from ntfs_attr_iget() is kept in
+ * itype.index.bmp_ino of the ntfs inode.
+ *
+ * Q: What locks are held when the function is called?
+ * A: i_state has I_LOCK set, hence the inode is locked, also
+ *    i_count is set to 1, so it is not going to go away
+ *
+ * Return 0 on success and -errno on error.  In the error case, the inode will
+ * have had make_bad_inode() executed on it and, unless the error is -ENOMEM
+ * or -EOPNOTSUPP, the volume is flagged as having errors.
+ */
+static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi)
+{
+        ntfs_volume *vol = NTFS_SB(vi->i_sb);
+        ntfs_inode *ni, *base_ni, *bni;
+        struct inode *bvi;
+        MFT_RECORD *m;
+        ntfs_attr_search_ctx *ctx;
+        INDEX_ROOT *ir;
+        u8 *ir_end, *index_end;
+        int err = 0;
+        ntfs_debug("Entering for i_ino 0x%lx.", vi->i_ino);
+        ntfs_init_big_inode(vi);
+        ni      = NTFS_I(vi);
+        base_ni = NTFS_I(base_vi);
+        /* Just mirror the values from the base inode. */
+        vi->i_blksize   = base_vi->i_blksize;
+        vi->i_version   = base_vi->i_version;
+        vi->i_uid       = base_vi->i_uid;
+        vi->i_gid       = base_vi->i_gid;
+        vi->i_nlink     = base_vi->i_nlink;
+        vi->i_mtime     = base_vi->i_mtime;
+        vi->i_ctime     = base_vi->i_ctime;
+        vi->i_atime     = base_vi->i_atime;
+        vi->i_generation = ni->seq_no = base_ni->seq_no;
+        /* Set inode type to zero but preserve permissions. */
+        vi->i_mode      = base_vi->i_mode & ~S_IFMT;
+        /* Map the mft record for the base inode. */
+        m = map_mft_record(base_ni);
+        if (IS_ERR(m)) {
+                err = PTR_ERR(m);
+                /* Nothing is mapped yet, so skip the release steps. */
+                goto err_out;
+        }
+        ctx = ntfs_attr_get_search_ctx(base_ni, m);
+        if (!ctx) {
+                err = -ENOMEM;
+                goto unm_err_out;
+        }
+        /* Find the index root attribute. */
+        err = ntfs_attr_lookup(AT_INDEX_ROOT, ni->name, ni->name_len,
+                        CASE_SENSITIVE, 0, NULL, 0, ctx);
+        if (unlikely(err)) {
+                if (err == -ENOENT)
+                        ntfs_error(vi->i_sb, "$INDEX_ROOT attribute is "
+                                        "missing.");
+                goto unm_err_out;
+        }
+        /*
+         * From here on, every consistency check that fails jumps to
+         * unm_err_out with err still zero, which is turned into -EIO there.
+         */
+        /* Set up the state. */
+        if (unlikely(ctx->attr->non_resident)) {
+                ntfs_error(vol->sb, "$INDEX_ROOT attribute is not resident.");
+                goto unm_err_out;
+        }
+        /* Ensure the attribute name is placed before the value. */
+        if (unlikely(ctx->attr->name_length &&
+                        (le16_to_cpu(ctx->attr->name_offset) >=
+                        le16_to_cpu(ctx->attr->data.resident.
+                        value_offset)))) {
+                ntfs_error(vol->sb, "$INDEX_ROOT attribute name is placed "
+                                "after the attribute value.");
+                goto unm_err_out;
+        }
+        /* Compressed/encrypted/sparse index root is not allowed. */
+        if (ctx->attr->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_ENCRYPTED |
+                        ATTR_IS_SPARSE)) {
+                ntfs_error(vi->i_sb, "Found compressed/encrypted/sparse index "
+                                "root attribute.");
+                goto unm_err_out;
+        }
+        /* The index root value must lie entirely within the mft record. */
+        ir = (INDEX_ROOT*)((u8*)ctx->attr +
+                        le16_to_cpu(ctx->attr->data.resident.value_offset));
+        ir_end = (u8*)ir + le32_to_cpu(ctx->attr->data.resident.value_length);
+        if (ir_end > (u8*)ctx->mrec + vol->mft_record_size) {
+                ntfs_error(vi->i_sb, "$INDEX_ROOT attribute is corrupt.");
+                goto unm_err_out;
+        }
+        /* The index itself must lie entirely within the index root value. */
+        index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length);
+        if (index_end > ir_end) {
+                ntfs_error(vi->i_sb, "Index is corrupt.");
+                goto unm_err_out;
+        }
+        if (ir->type) {
+                ntfs_error(vi->i_sb, "Index type is not 0 (type is 0x%x).",
+                                le32_to_cpu(ir->type));
+                goto unm_err_out;
+        }
+        ni->itype.index.collation_rule = ir->collation_rule;
+        ntfs_debug("Index collation rule is 0x%x.",
+                        le32_to_cpu(ir->collation_rule));
+        ni->itype.index.block_size = le32_to_cpu(ir->index_block_size);
+        /* x & (x - 1) is non-zero when more than one bit is set in x. */
+        if (ni->itype.index.block_size & (ni->itype.index.block_size - 1)) {
+                ntfs_error(vi->i_sb, "Index block size (%u) is not a power of "
+                                "two.", ni->itype.index.block_size);
+                goto unm_err_out;
+        }
+        if (ni->itype.index.block_size > PAGE_CACHE_SIZE) {
+                ntfs_error(vi->i_sb, "Index block size (%u) > PAGE_CACHE_SIZE "
+                                "(%ld) is not supported.  Sorry.",
+                                ni->itype.index.block_size, PAGE_CACHE_SIZE);
+                /* Unsupported rather than corrupt. */
+                err = -EOPNOTSUPP;
+                goto unm_err_out;
+        }
+        if (ni->itype.index.block_size < NTFS_BLOCK_SIZE) {
+                ntfs_error(vi->i_sb, "Index block size (%u) < NTFS_BLOCK_SIZE "
+                                "(%i) is not supported.  Sorry.",
+                                ni->itype.index.block_size, NTFS_BLOCK_SIZE);
+                /* Unsupported rather than corrupt. */
+                err = -EOPNOTSUPP;
+                goto unm_err_out;
+        }
+        ni->itype.index.block_size_bits = ffs(ni->itype.index.block_size) - 1;
+        /*
+         * Determine the size of a vcn in the index: a cluster if it fits into
+         * an index block, a sector otherwise.
+         */
+        if (vol->cluster_size <= ni->itype.index.block_size) {
+                ni->itype.index.vcn_size = vol->cluster_size;
+                ni->itype.index.vcn_size_bits = vol->cluster_size_bits;
+        } else {
+                ni->itype.index.vcn_size = vol->sector_size;
+                ni->itype.index.vcn_size_bits = vol->sector_size_bits;
+        }
+        /* Check for presence of index allocation attribute. */
+        if (!(ir->index.flags & LARGE_INDEX)) {
+                /* No index allocation. */
+                vi->i_size = ni->initialized_size = ni->allocated_size = 0;
+                /* We are done with the mft record, so we release it. */
+                ntfs_attr_put_search_ctx(ctx);
+                unmap_mft_record(base_ni);
+                m = NULL;
+                ctx = NULL;
+                goto skip_large_index_stuff;
+        } /* LARGE_INDEX:  Index allocation present.  Setup state. */
+        NInoSetIndexAllocPresent(ni);
+        /* Find index allocation attribute. */
+        ntfs_attr_reinit_search_ctx(ctx);
+        err = ntfs_attr_lookup(AT_INDEX_ALLOCATION, ni->name, ni->name_len,
+                        CASE_SENSITIVE, 0, NULL, 0, ctx);
+        if (unlikely(err)) {
+                if (err == -ENOENT)
+                        ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is "
+                                        "not present but $INDEX_ROOT "
+                                        "indicated it is.");
+                else
+                        ntfs_error(vi->i_sb, "Failed to lookup "
+                                        "$INDEX_ALLOCATION attribute.");
+                goto unm_err_out;
+        }
+        if (!ctx->attr->non_resident) {
+                ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is "
+                                "resident.");
+                goto unm_err_out;
+        }
+        /*
+         * Ensure the attribute name is placed before the mapping pairs array.
+         */
+        if (unlikely(ctx->attr->name_length && (le16_to_cpu(
+                        ctx->attr->name_offset) >= le16_to_cpu(
+                        ctx->attr->data.non_resident.mapping_pairs_offset)))) {
+                ntfs_error(vol->sb, "$INDEX_ALLOCATION attribute name is "
+                                "placed after the mapping pairs array.");
+                goto unm_err_out;
+        }
+        if (ctx->attr->flags & ATTR_IS_ENCRYPTED) {
+                ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is "
+                                "encrypted.");
+                goto unm_err_out;
+        }
+        if (ctx->attr->flags & ATTR_IS_SPARSE) {
+                ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is sparse.");
+                goto unm_err_out;
+        }
+        if (ctx->attr->flags & ATTR_COMPRESSION_MASK) {
+                ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is "
+                                "compressed.");
+                goto unm_err_out;
+        }
+        if (ctx->attr->data.non_resident.lowest_vcn) {
+                ntfs_error(vi->i_sb, "First extent of $INDEX_ALLOCATION "
+                                "attribute has non zero lowest_vcn.");
+                goto unm_err_out;
+        }
+        /* Setup all the sizes. */
+        vi->i_size = sle64_to_cpu(ctx->attr->data.non_resident.data_size);
+        ni->initialized_size = sle64_to_cpu(
+                        ctx->attr->data.non_resident.initialized_size);
+        ni->allocated_size = sle64_to_cpu(
+                        ctx->attr->data.non_resident.allocated_size);
+        /*
+         * We are done with the mft record, so we release it.  Otherwise
+         * we would deadlock in ntfs_attr_iget().
+         */
+        ntfs_attr_put_search_ctx(ctx);
+        unmap_mft_record(base_ni);
+        m = NULL;
+        ctx = NULL;
+        /* Get the index bitmap attribute inode. */
+        bvi = ntfs_attr_iget(base_vi, AT_BITMAP, ni->name, ni->name_len);
+        if (IS_ERR(bvi)) {
+                ntfs_error(vi->i_sb, "Failed to get bitmap attribute.");
+                err = PTR_ERR(bvi);
+                /* ctx and m are NULL now, so unm_err_out releases nothing. */
+                goto unm_err_out;
+        }
+        bni = NTFS_I(bvi);
+        if (NInoCompressed(bni) || NInoEncrypted(bni) ||
+                        NInoSparse(bni)) {
+                ntfs_error(vi->i_sb, "$BITMAP attribute is compressed and/or "
+                                "encrypted and/or sparse.");
+                goto iput_unm_err_out;
+        }
+        /*
+         * Consistency check bitmap size vs. index allocation size: the number
+         * of bits in the bitmap must be at least the number of index blocks.
+         */
+        if ((bvi->i_size << 3) < (vi->i_size >>
+                        ni->itype.index.block_size_bits)) {
+                ntfs_error(vi->i_sb, "Index bitmap too small (0x%llx) for "
+                                "index allocation (0x%llx).", bvi->i_size << 3,
+                                vi->i_size);
+                goto iput_unm_err_out;
+        }
+        /* Only attach the bitmap inode once it has passed all checks. */
+        ni->itype.index.bmp_ino = bvi;
+skip_large_index_stuff:
+        /* Setup the operations for this index inode. */
+        vi->i_op = NULL;
+        vi->i_fop = NULL;
+        vi->i_mapping->a_ops = &ntfs_mst_aops;
+        /* Number of 512-byte blocks used on disk (hence the shift by 9). */
+        vi->i_blocks = ni->allocated_size >> 9;
+        /*
+         * Make sure the base inode doesn't go away and attach it to the
+         * index inode.
+         */
+        igrab(base_vi);
+        ni->ext.base_ntfs_ino = base_ni;
+        ni->nr_extents = -1;
+        ntfs_debug("Done.");
+        return 0;
+iput_unm_err_out:
+        /* Drop the reference on the bitmap inode obtained above. */
+        iput(bvi);
+unm_err_out:
+        /* Failed consistency checks arrive with err == 0; report them as I/O errors. */
+        if (!err)
+                err = -EIO;
+        if (ctx)
+                ntfs_attr_put_search_ctx(ctx);
+        if (m)
+                unmap_mft_record(base_ni);
+err_out:
+        ntfs_error(vi->i_sb, "Failed with error code %i while reading index "
+                        "inode (mft_no 0x%lx, name_len %i).", err, vi->i_ino,
+                        ni->name_len);
+        make_bad_inode(vi);
+        /*
+         * An unsupported feature or a memory shortage says nothing about the
+         * on-disk state, so only flag the volume for all other errors.
+         */
+        if (err != -EOPNOTSUPP && err != -ENOMEM)
+                NVolSetErrors(vol);
+        return err;
+}
+/**
+ * ntfs_read_inode_mount - special read_inode for mount time use only
+ * @vi:         inode to read
+ *
+ * Read inode FILE_MFT at mount time, only called with super_block lock
+ * held from within the read_super() code path.
+ *
+ * This function exists because when it is called the page cache for $MFT/$DATA
+ * is not initialized and hence we cannot get at the contents of mft records
+ * by calling map_mft_record*().
+ *
+ * Further it needs to cope with the circular references problem, i.e. cannot
+ * load any attributes other than $ATTRIBUTE_LIST until $DATA is loaded, because
+ * we do not know where the other extent mft records are yet and again, because
+ * we cannot call map_mft_record*() yet.  Obviously this applies only when an
+ * attribute list is actually present in $MFT inode.
+ *
+ * We solve these problems by starting with the $DATA attribute before anything
+ * else and iterating using ntfs_attr_lookup($DATA) over all extents.  As each
+ * extent is found, we ntfs_mapping_pairs_decompress() including the implied
+ * ntfs_runlists_merge().  Each step of the iteration necessarily provides
+ * sufficient information for the next step to complete.
+ *
+ * This should work but there are two possible pit falls (see inline comments
+ * below), but only time will tell if they are real pits or just smoke...
+ *
+ * Side effects: the mft_ino field of the volume is pointed at @vi early on and
+ * is not reset by this function on failure.  The buffer used to hold the first
+ * mft record is private to this function and is freed on every return path.
+ *
+ * Error handling is done through the chained labels em_put_err_out (log that
+ * the first $DATA extent was not found in the attribute list), put_err_out
+ * (release the search context) and err_out (mark @vi bad and free the buffer).
+ *
+ * Return 0 on success and -1 (not an errno value) on error.  On error @vi is a
+ * bad inode, either because it was marked bad here or, in the one early return
+ * path, because is_bad_inode() reported it bad after ntfs_read_locked_inode().
+ */
+int ntfs_read_inode_mount(struct inode *vi)
+{
+        VCN next_vcn, last_vcn, highest_vcn;
+        s64 block;
+        struct super_block *sb = vi->i_sb;
+        ntfs_volume *vol = NTFS_SB(sb);
+        struct buffer_head *bh;
+        ntfs_inode *ni;
+        MFT_RECORD *m = NULL;
+        ATTR_RECORD *attr;
+        ntfs_attr_search_ctx *ctx;
+        unsigned int i, nr_blocks;
+        int err;
+        ntfs_debug("Entering.");
+        /* Initialize the ntfs specific part of @vi. */
+        ntfs_init_big_inode(vi);
+        ni = NTFS_I(vi);
+        /* Setup the data attribute. It is special as it is mst protected. */
+        NInoSetNonResident(ni);
+        NInoSetMstProtected(ni);
+        ni->type = AT_DATA;
+        ni->name = NULL;
+        ni->name_len = 0;
+        /*
+         * This sets up our little cheat allowing us to reuse the async read io
+         * completion handler for directories.
+         */
+        ni->itype.index.block_size = vol->mft_record_size;
+        ni->itype.index.block_size_bits = vol->mft_record_size_bits;
+        /* Very important! Needed to be able to call map_mft_record*(). */
+        vol->mft_ino = vi;
+        /*
+         * Allocate enough memory to read the first mft record.  The buffer is
+         * at least one device block in size as whole blocks are copied into it
+         * below.
+         */
+        if (vol->mft_record_size > 64 * 1024) {
+                ntfs_error(sb, "Unsupported mft record size %i (max 64kiB).",
+                                vol->mft_record_size);
+                goto err_out;
+        }
+        i = vol->mft_record_size;
+        if (i < sb->s_blocksize)
+                i = sb->s_blocksize;
+        m = (MFT_RECORD*)ntfs_malloc_nofs(i);
+        if (!m) {
+                ntfs_error(sb, "Failed to allocate buffer for $MFT record 0.");
+                goto err_out;
+        }
+        /* Determine the first block of the $MFT/$DATA attribute. */
+        block = vol->mft_lcn << vol->cluster_size_bits >>
+                        sb->s_blocksize_bits;
+        nr_blocks = vol->mft_record_size >> sb->s_blocksize_bits;
+        if (!nr_blocks)
+                nr_blocks = 1;
+        /* Load $MFT/$DATA's first mft record. */
+        for (i = 0; i < nr_blocks; i++) {
+                bh = sb_bread(sb, block++);
+                if (!bh) {
+                        ntfs_error(sb, "Device read failed.");
+                        goto err_out;
+                }
+                memcpy((char*)m + (i << sb->s_blocksize_bits), bh->b_data,
+                                sb->s_blocksize);
+                brelse(bh);
+        }
+        /* Apply the mst fixups. */
+        if (post_read_mst_fixup((NTFS_RECORD*)m, vol->mft_record_size)) {
+                /* FIXME: Try to use the $MFTMirr now. */
+                ntfs_error(sb, "MST fixup failed. $MFT is corrupt.");
+                goto err_out;
+        }
+        /* Need this to sanity check attribute list references to $MFT. */
+        vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number);
+        /* Provides readpage() and sync_page() for map_mft_record(). */
+        vi->i_mapping->a_ops = &ntfs_mst_aops;
+        ctx = ntfs_attr_get_search_ctx(ni, m);
+        if (!ctx) {
+                err = -ENOMEM;
+                goto err_out;
+        }
+        /* Find the attribute list attribute if present. */
+        err = ntfs_attr_lookup(AT_ATTRIBUTE_LIST, NULL, 0, 0, 0, NULL, 0, ctx);
+        if (err) {
+                if (unlikely(err != -ENOENT)) {
+                        ntfs_error(sb, "Failed to lookup attribute list "
+                                        "attribute. You should run chkdsk.");
+                        goto put_err_out;
+                }
+        } else /* if (!err) */ {
+                ATTR_LIST_ENTRY *al_entry, *next_al_entry;
+                u8 *al_end;
+                ntfs_debug("Attribute list attribute found in $MFT.");
+                NInoSetAttrList(ni);
+                if (ctx->attr->flags & ATTR_IS_ENCRYPTED ||
+                                ctx->attr->flags & ATTR_COMPRESSION_MASK ||
+                                ctx->attr->flags & ATTR_IS_SPARSE) {
+                        ntfs_error(sb, "Attribute list attribute is "
+                                        "compressed/encrypted/sparse. Not "
+                                        "allowed. $MFT is corrupt. You should "
+                                        "run chkdsk.");
+                        goto put_err_out;
+                }
+                /* Now allocate memory for the attribute list. */
+                ni->attr_list_size = (u32)ntfs_attr_size(ctx->attr);
+                ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size);
+                if (!ni->attr_list) {
+                        ntfs_error(sb, "Not enough memory to allocate buffer "
+                                        "for attribute list.");
+                        goto put_err_out;
+                }
+                if (ctx->attr->non_resident) {
+                        NInoSetAttrListNonResident(ni);
+                        if (ctx->attr->data.non_resident.lowest_vcn) {
+                                ntfs_error(sb, "Attribute list has non zero "
+                                                "lowest_vcn. $MFT is corrupt. "
+                                                "You should run chkdsk.");
+                                goto put_err_out;
+                        }
+                        /* Setup the runlist. */
+                        ni->attr_list_rl.rl = ntfs_mapping_pairs_decompress(vol,
+                                        ctx->attr, NULL);
+                        if (IS_ERR(ni->attr_list_rl.rl)) {
+                                err = PTR_ERR(ni->attr_list_rl.rl);
+                                ni->attr_list_rl.rl = NULL;
+                                ntfs_error(sb, "Mapping pairs decompression "
+                                                "failed with error code %i.",
+                                                -err);
+                                goto put_err_out;
+                        }
+                        /* Now load the attribute list. */
+                        if ((err = load_attribute_list(vol, &ni->attr_list_rl,
+                                        ni->attr_list, ni->attr_list_size,
+                                        sle64_to_cpu(ctx->attr->data.
+                                        non_resident.initialized_size)))) {
+                                ntfs_error(sb, "Failed to load attribute list "
+                                                "attribute with error code %i.",
+                                                -err);
+                                goto put_err_out;
+                        }
+                } else /* if (!ctx->attr->non_resident) */ {
+                        if ((u8*)ctx->attr + le16_to_cpu(
+                                        ctx->attr->data.resident.value_offset) +
+                                        le32_to_cpu(
+                                        ctx->attr->data.resident.value_length) >
+                                        (u8*)ctx->mrec + vol->mft_record_size) {
+                                ntfs_error(sb, "Corrupt attribute list "
+                                                "attribute.");
+                                goto put_err_out;
+                        }
+                        /* Now copy the attribute list. */
+                        memcpy(ni->attr_list, (u8*)ctx->attr + le16_to_cpu(
+                                        ctx->attr->data.resident.value_offset),
+                                        le32_to_cpu(
+                                        ctx->attr->data.resident.value_length));
+                }
+                /* The attribute list is now setup in memory. */
+                /*
+                 * FIXME: I don't know if this case is actually possible.
+                 * According to logic it is not possible but I have seen too
+                 * many weird things in MS software to rely on logic... Thus we
+                 * perform a manual search and make sure the first $MFT/$DATA
+                 * extent is in the base inode. If it is not we abort with an
+                 * error and if we ever see a report of this error we will need
+                 * to do some magic in order to have the necessary mft record
+                 * loaded and in the right place in the page cache. But
+                 * hopefully logic will prevail and this never happens...
+                 */
+                al_entry = (ATTR_LIST_ENTRY*)ni->attr_list;
+                al_end = (u8*)al_entry + ni->attr_list_size;
+                for (;; al_entry = next_al_entry) {
+                        /* Out of bounds check. */
+                        if ((u8*)al_entry < ni->attr_list ||
+                                        (u8*)al_entry > al_end)
+                                goto em_put_err_out;
+                        /*
+                         * Catch the end of the attribute list.  The checks
+                         * that follow also reject a zero length entry, which
+                         * would never advance the loop, and an entry whose
+                         * first 6 bytes or stated length extend past the end
+                         * of the attribute list buffer.  An entry with a type
+                         * above AT_DATA ends the search with an error and any
+                         * other entry that is not AT_DATA is skipped.
+                         */
+                        if ((u8*)al_entry == al_end)
+                                goto em_put_err_out;
+                        if (!al_entry->length)
+                                goto em_put_err_out;
+                        if ((u8*)al_entry + 6 > al_end || (u8*)al_entry +
+                                        le16_to_cpu(al_entry->length) > al_end)
+                                goto em_put_err_out;
+                        next_al_entry = (ATTR_LIST_ENTRY*)((u8*)al_entry +
+                                        le16_to_cpu(al_entry->length));
+                        if (le32_to_cpu(al_entry->type) >
+                                        const_le32_to_cpu(AT_DATA))
+                                goto em_put_err_out;
+                        if (AT_DATA != al_entry->type)
+                                continue;
+                        /* We want an unnamed attribute. */
+                        if (al_entry->name_length)
+                                goto em_put_err_out;
+                        /* Want the first entry, i.e. lowest_vcn == 0. */
+                        if (al_entry->lowest_vcn)
+                                goto em_put_err_out;
+                        /* First entry has to be in the base mft record. */
+                        if (MREF_LE(al_entry->mft_reference) != vi->i_ino) {
+                                /* MFT references do not match, logic fails. */
+                                ntfs_error(sb, "BUG: The first $DATA extent "
+                                                "of $MFT is not in the base "
+                                                "mft record. Please report "
+                                                "you saw this message to "
+                                                "linux-ntfs-dev@lists."
+                                                "sourceforge.net");
+                                goto put_err_out;
+                        } else {
+                                /* Sequence numbers must match. */
+                                if (MSEQNO_LE(al_entry->mft_reference) !=
+                                                ni->seq_no)
+                                        goto em_put_err_out;
+                                /* Got it. All is ok. We can stop now. */
+                                break;
+                        }
+                }
+        }
+        ntfs_attr_reinit_search_ctx(ctx);
+        /* Now load all attribute extents. */
+        attr = NULL;
+        next_vcn = last_vcn = highest_vcn = 0;
+        while (!(err = ntfs_attr_lookup(AT_DATA, NULL, 0, 0, next_vcn, NULL, 0,
+                        ctx))) {
+                runlist_element *nrl;
+                /* Cache the current attribute. */
+                attr = ctx->attr;
+                /* $MFT must be non-resident. */
+                if (!attr->non_resident) {
+                        ntfs_error(sb, "$MFT must be non-resident but a "
+                                        "resident extent was found. $MFT is "
+                                        "corrupt. Run chkdsk.");
+                        goto put_err_out;
+                }
+                /* $MFT must be uncompressed, non-sparse, and unencrypted. */
+                if (attr->flags & ATTR_COMPRESSION_MASK ||
+                                attr->flags & ATTR_IS_ENCRYPTED ||
+                                attr->flags & ATTR_IS_SPARSE) {
+                        ntfs_error(sb, "$MFT must be uncompressed, "
+                                        "non-sparse, and unencrypted but a "
+                                        "compressed/sparse/encrypted extent "
+                                        "was found. $MFT is corrupt. Run "
+                                        "chkdsk.");
+                        goto put_err_out;
+                }
+                /*
+                 * Decompress the mapping pairs array of this extent and merge
+                 * the result into the existing runlist. No need for locking
+                 * as we have exclusive access to the inode at this time and we
+                 * are a mount in progress task, too.
+                 */
+                nrl = ntfs_mapping_pairs_decompress(vol, attr, ni->runlist.rl);
+                if (IS_ERR(nrl)) {
+                        ntfs_error(sb, "ntfs_mapping_pairs_decompress() "
+                                        "failed with error code %ld.  $MFT is "
+                                        "corrupt.", PTR_ERR(nrl));
+                        goto put_err_out;
+                }
+                ni->runlist.rl = nrl;
+                /* Are we in the first extent? */
+                if (!next_vcn) {
+                        if (attr->data.non_resident.lowest_vcn) {
+                                ntfs_error(sb, "First extent of $DATA "
+                                                "attribute has non zero "
+                                                "lowest_vcn. $MFT is corrupt. "
+                                                "You should run chkdsk.");
+                                goto put_err_out;
+                        }
+                        /* Get the last vcn in the $DATA attribute. */
+                        last_vcn = sle64_to_cpu(
+                                        attr->data.non_resident.allocated_size)
+                                        >> vol->cluster_size_bits;
+                        /* Fill in the inode size. */
+                        vi->i_size = sle64_to_cpu(
+                                        attr->data.non_resident.data_size);
+                        ni->initialized_size = sle64_to_cpu(attr->data.
+                                        non_resident.initialized_size);
+                        ni->allocated_size = sle64_to_cpu(
+                                        attr->data.non_resident.allocated_size);
+                        /*
+                         * Verify the number of mft records does not exceed
+                         * 2^32 - 1.
+                         */
+                        if ((vi->i_size >> vol->mft_record_size_bits) >=
+                                        (1ULL << 32)) {
+                                ntfs_error(sb, "$MFT is too big! Aborting.");
+                                goto put_err_out;
+                        }
+                        /*
+                         * We have got the first extent of the runlist for
+                         * $MFT which means it is now relatively safe to call
+                         * the normal ntfs_read_inode() function.
+                         * Complete reading the inode, this will actually
+                         * re-read the mft record for $MFT, this time entering
+                         * it into the page cache with which we complete the
+                         * kick start of the volume. It should be safe to do
+                         * this now as the first extent of $MFT/$DATA is
+                         * already known and we would hope that we don't need
+                         * further extents in order to find the other
+                         * attributes belonging to $MFT. Only time will tell if
+                         * this is really the case. If not we will have to play
+                         * magic at this point, possibly duplicating a lot of
+                         * ntfs_read_inode() at this point. We will need to
+                         * ensure we do enough of its work to be able to call
+                         * ntfs_read_inode() on extents of $MFT/$DATA. But lets
+                         * hope this never happens...
+                         */
+                        ntfs_read_locked_inode(vi);
+                        if (is_bad_inode(vi)) {
+                                ntfs_error(sb, "ntfs_read_inode() of $MFT "
+                                                "failed. BUG or corrupt $MFT. "
+                                                "Run chkdsk and if no errors "
+                                                "are found, please report you "
+                                                "saw this message to "
+                                                "linux-ntfs-dev@lists."
+                                                "sourceforge.net");
+                                ntfs_attr_put_search_ctx(ctx);
+                                /*
+                                 * The inode is already bad so only the
+                                 * temporary mft record buffer is left to free.
+                                 */
+                                ntfs_free(m);
+                                return -1;
+                        }
+                        /*
+                         * Re-initialize some specifics about $MFT's inode as
+                         * ntfs_read_inode() will have set up the default ones.
+                         */
+                        /* Set uid and gid to root. */
+                        vi->i_uid = vi->i_gid = 0;
+                        /* Regular file. No access for anyone. */
+                        vi->i_mode = S_IFREG;
+                        /* No VFS initiated operations allowed for $MFT. */
+                        vi->i_op = &ntfs_empty_inode_ops;
+                        vi->i_fop = &ntfs_empty_file_ops;
+                }
+                /* Get the lowest vcn for the next extent. */
+                highest_vcn = sle64_to_cpu(attr->data.non_resident.highest_vcn);
+                next_vcn = highest_vcn + 1;
+                /* Only one extent or error, which we catch below. */
+                if (next_vcn <= 0)
+                        break;
+                /* Avoid endless loops due to corruption. */
+                if (next_vcn < sle64_to_cpu(
+                                attr->data.non_resident.lowest_vcn)) {
+                        ntfs_error(sb, "$MFT has corrupt attribute list "
+                                        "attribute. Run chkdsk.");
+                        goto put_err_out;
+                }
+        }
+        if (err != -ENOENT) {
+                ntfs_error(sb, "Failed to lookup $MFT/$DATA attribute extent. "
+                                "$MFT is corrupt. Run chkdsk.");
+                goto put_err_out;
+        }
+        if (!attr) {
+                ntfs_error(sb, "$MFT/$DATA attribute not found. $MFT is "
+                                "corrupt. Run chkdsk.");
+                goto put_err_out;
+        }
+        if (highest_vcn && highest_vcn != last_vcn - 1) {
+                ntfs_error(sb, "Failed to load the complete runlist for "
+                                "$MFT/$DATA. Driver bug or corrupt $MFT. "
+                                "Run chkdsk.");
+                ntfs_debug("highest_vcn = 0x%llx, last_vcn - 1 = 0x%llx",
+                                (unsigned long long)highest_vcn,
+                                (unsigned long long)last_vcn - 1);
+                goto put_err_out;
+        }
+        ntfs_attr_put_search_ctx(ctx);
+        ntfs_debug("Done.");
+        ntfs_free(m);
+        return 0;
+em_put_err_out:
+        ntfs_error(sb, "Couldn't find first extent of $DATA attribute in "
+                        "attribute list. $MFT is corrupt. Run chkdsk.");
+put_err_out:
+        ntfs_attr_put_search_ctx(ctx);
+err_out:
+        ntfs_error(sb, "Failed. Marking inode as bad.");
+        make_bad_inode(vi);
+        ntfs_free(m);
+        return -1;
+}
+/**
+ * ntfs_put_inode - handler for when the inode reference count is decremented
+ * @vi:         vfs inode
+ *
+ * The VFS calls ntfs_put_inode() every time the inode reference count (i_count)
+ * is about to be decremented (but before the decrement itself).
+ *
+ * When @vi is a directory with an index allocation and exactly two references
+ * to it remain, one of which is being dropped, the attribute inode for the
+ * directory index bitmap is detached from the ntfs inode and put, if it is
+ * present.  Otherwise the directory inode would remain pinned for ever.
+ */
+void ntfs_put_inode(struct inode *vi)
+{
+        ntfs_inode *ni;
+        struct inode *bmp_vi = NULL;
+        /* Nothing to do unless this is a directory with two references. */
+        if (!S_ISDIR(vi->i_mode) || atomic_read(&vi->i_count) != 2)
+                return;
+        ni = NTFS_I(vi);
+        if (!NInoIndexAllocPresent(ni))
+                return;
+        /* Recheck the reference count now that i_sem is held. */
+        down(&vi->i_sem);
+        if (atomic_read(&vi->i_count) == 2) {
+                bmp_vi = ni->itype.index.bmp_ino;
+                if (bmp_vi)
+                        ni->itype.index.bmp_ino = NULL;
+        }
+        up(&vi->i_sem);
+        /* The bitmap inode, if one was detached, is put after up(). */
+        if (bmp_vi)
+                iput(bmp_vi);
+}
+/*
+ * __ntfs_clear_inode - free the memory attached to the ntfs inode @ni
+ *
+ * Frees, in this order, the runlist, the in-memory attribute list, the
+ * attribute list runlist, and the attribute name.  The two runlists are freed
+ * with their respective lock held for writing.  All freed pointers except the
+ * name are reset to NULL.  The name is left alone if its length is zero or if
+ * it is the shared I30 name.
+ */
+static void __ntfs_clear_inode(ntfs_inode *ni)
+{
+        /* Free all allocated memory, starting with the runlist. */
+        down_write(&ni->runlist.lock);
+        if (ni->runlist.rl != NULL) {
+                ntfs_free(ni->runlist.rl);
+                ni->runlist.rl = NULL;
+        }
+        up_write(&ni->runlist.lock);
+        /* The in-memory copy of the attribute list. */
+        if (ni->attr_list != NULL) {
+                ntfs_free(ni->attr_list);
+                ni->attr_list = NULL;
+        }
+        /* The runlist of the attribute list. */
+        down_write(&ni->attr_list_rl.lock);
+        if (ni->attr_list_rl.rl != NULL) {
+                ntfs_free(ni->attr_list_rl.rl);
+                ni->attr_list_rl.rl = NULL;
+        }
+        up_write(&ni->attr_list_rl.lock);
+        /* No name, or the I30 name which must not be freed: we are done. */
+        if (!ni->name_len || ni->name == I30)
+                return;
+        /* Catch bugs... */
+        BUG_ON(!ni->name);
+        kfree(ni->name);
+}
+/*
+ * ntfs_clear_extent_inode - free the memory of and destroy the ntfs inode @ni
+ *
+ * @ni must not be an attribute inode and must have nr_extents == -1, both of
+ * which are enforced with BUG_ON().  With NTFS_RW, an error is logged when @ni
+ * is still dirty while its base inode is not a bad inode.
+ */
+void ntfs_clear_extent_inode(ntfs_inode *ni)
+{
+        ntfs_debug("Entering for inode 0x%lx.", ni->mft_no);
+        BUG_ON(NInoAttr(ni));
+        BUG_ON(ni->nr_extents != -1);
+#ifdef NTFS_RW
+        /*
+         * FIXME:  A dirty extent inode is being discarded here.  Do
+         * something!!!  For now we only complain, and only if the base inode
+         * is not bad.
+         */
+        if (NInoDirty(ni) && !is_bad_inode(VFS_I(ni->ext.base_ntfs_ino)))
+                ntfs_error(ni->vol->sb, "Clearing dirty extent inode!  "
+                                "Losing data!  This is a BUG!!!");
+#endif /* NTFS_RW */
+        /* Release everything hanging off the ntfs inode, then the inode. */
+        __ntfs_clear_inode(ni);
+        ntfs_destroy_extent_inode(ni);
+}
+/**
+ * ntfs_clear_big_inode - clean up the ntfs specific part of an inode
+ * @vi:         vfs inode pending annihilation
+ *
+ * When the VFS is going to remove an inode from memory, ntfs_clear_big_inode()
+ * is called, which deallocates all memory belonging to the NTFS specific part
+ * of the inode and returns.
+ *
+ * The index bitmap inode of an index inode is put first.  With NTFS_RW, a
+ * dirty inode is then committed before any of its memory is released.
+ */
+void ntfs_clear_big_inode(struct inode *vi)
+{
+        ntfs_inode *ni = NTFS_I(vi);
+        /*
+         * An index inode references the attribute inode for its index bitmap.
+         * Put that attribute inode now, if it is present, as once the index
+         * inode has disappeared nothing would reference it any more and it
+         * would remain pinned for ever.
+         */
+        if (NInoAttr(ni) && ni->type == AT_INDEX_ALLOCATION &&
+                        NInoIndexAllocPresent(ni)) {
+                struct inode *bmp_vi = ni->itype.index.bmp_ino;
+                if (bmp_vi) {
+                        iput(bmp_vi);
+                        ni->itype.index.bmp_ino = NULL;
+                }
+        }
+#ifdef NTFS_RW
+        if (NInoDirty(ni)) {
+                BOOL bad_before = is_bad_inode(vi);
+                /* Committing the inode also commits all extent inodes. */
+                ntfs_commit_inode(vi);
+                /* Only complain if the commit is what went wrong. */
+                if (!bad_before && (is_bad_inode(vi) || NInoDirty(ni))) {
+                        ntfs_error(vi->i_sb, "Failed to commit dirty inode "
+                                        "0x%lx.  Losing data!", vi->i_ino);
+                        // FIXME:  Do something!!!
+                }
+        }
+#endif /* NTFS_RW */
+        /* No need to lock at this stage as no one else has a reference. */
+        if (ni->nr_extents > 0) {
+                int idx = 0;
+                while (idx < ni->nr_extents) {
+                        ntfs_clear_extent_inode(ni->ext.extent_ntfs_inos[idx]);
+                        idx++;
+                }
+                kfree(ni->ext.extent_ntfs_inos);
+        }
+        __ntfs_clear_inode(ni);
+        /* Release the base inode if we are an attribute inode holding it. */
+        if (NInoAttr(ni) && ni->nr_extents == -1) {
+                iput(VFS_I(ni->ext.base_ntfs_ino));
+                ni->nr_extents = 0;
+                ni->ext.base_ntfs_ino = NULL;
+        }
+}
+/**
+ * ntfs_show_options - show mount options in /proc/mounts
+ * @sf:         seq_file in which to write our mount options
+ * @mnt:        vfs mount whose mount options to display
+ *
+ * Called by the VFS once for each mounted ntfs volume when someone reads
+ * /proc/mounts in order to display the NTFS specific mount options of each
+ * mount.  The options of the ntfs volume behind the vfs mount @mnt are written
+ * to the seq file @sf, each one preceded by a comma.
+ *
+ * Always returns 0.
+ */
+int ntfs_show_options(struct seq_file *sf, struct vfsmount *mnt)
+{
+        ntfs_volume *vol = NTFS_SB(mnt->mnt_sb);
+        int idx = 0;
+        seq_printf(sf, ",uid=%i", vol->uid);
+        seq_printf(sf, ",gid=%i", vol->gid);
+        /* Show fmask and dmask separately only when they differ. */
+        if (vol->fmask != vol->dmask) {
+                seq_printf(sf, ",fmask=0%o", vol->fmask);
+                seq_printf(sf, ",dmask=0%o", vol->dmask);
+        } else {
+                seq_printf(sf, ",umask=0%o", vol->fmask);
+        }
+        seq_printf(sf, ",nls=%s", vol->nls_map->charset);
+        if (NVolCaseSensitive(vol))
+                seq_printf(sf, ",case_sensitive");
+        if (NVolShowSystemFiles(vol))
+                seq_printf(sf, ",show_sys_files");
+        /* Walk on_errors_arr up to its terminating entry with a zero val. */
+        while (on_errors_arr[idx].val) {
+                if (on_errors_arr[idx].val & vol->on_errors)
+                        seq_printf(sf, ",errors=%s", on_errors_arr[idx].str);
+                idx++;
+        }
+        seq_printf(sf, ",mft_zone_multiplier=%i", vol->mft_zone_multiplier);
+        return 0;
+}
+#ifdef NTFS_RW
+/**
+ * ntfs_truncate - called when the i_size of an ntfs inode is changed
+ * @vi:         inode for which the i_size was changed
+ *
+ * We do not support i_size changes yet.  The attribute belonging to @vi is
+ * looked up in its mft record and its size is compared with i_size.  If the
+ * two differ, an error is logged and i_size is reset to the attribute size.
+ *
+ * The kernel guarantees that @vi is a regular file (S_ISREG() is true) and
+ * that the change is allowed.
+ *
+ * This implies for us that @vi is a file inode rather than a directory, index,
+ * or attribute inode as well as that @vi is a base inode.
+ *
+ * Returns 0 on success or -errno on error.  Note that 0 is also returned when
+ * a size change was detected and undone.  On error the truncate failed flag
+ * of the ntfs inode is set and, unless the error is -ENOMEM, the volume is
+ * flagged as having errors and @vi is made a bad inode.
+ *
+ * Called with ->i_sem held.  In all but one case ->i_alloc_sem is held for
+ * writing.  The only case where ->i_alloc_sem is not held is
+ * mm/filemap.c::generic_file_buffered_write() where vmtruncate() is called
+ * with the current i_size as the offset which means that it is a noop as far
+ * as ntfs_truncate() is concerned.
+ */
+int ntfs_truncate(struct inode *vi)
+{
+        const char *te = "  Leaving file length out of sync with i_size.";
+        ntfs_inode *ni = NTFS_I(vi);
+        ntfs_volume *vol = ni->vol;
+        ntfs_attr_search_ctx *ctx = NULL;
+        MFT_RECORD *m;
+        int err;
+        ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
+        BUG_ON(NInoAttr(ni));
+        BUG_ON(ni->nr_extents < 0);
+        m = map_mft_record(ni);
+        if (IS_ERR(m)) {
+                err = PTR_ERR(m);
+                /* Nothing got mapped so nothing must be unmapped below. */
+                m = NULL;
+                ntfs_error(vi->i_sb, "Failed to map mft record for inode 0x%lx "
+                                "(error code %d).%s", vi->i_ino, err, te);
+                goto err_out;
+        }
+        ctx = ntfs_attr_get_search_ctx(ni, m);
+        if (unlikely(!ctx)) {
+                err = -ENOMEM;
+                ntfs_error(vi->i_sb, "Failed to allocate a search context for "
+                                "inode 0x%lx (not enough memory).%s",
+                                vi->i_ino, te);
+                goto err_out;
+        }
+        err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+                        CASE_SENSITIVE, 0, NULL, 0, ctx);
+        if (unlikely(err)) {
+                if (err != -ENOENT)
+                        ntfs_error(vi->i_sb, "Failed to lookup attribute in "
+                                        "inode 0x%lx (error code %d).",
+                                        vi->i_ino, err);
+                else
+                        ntfs_error(vi->i_sb, "Open attribute is missing from "
+                                        "mft record.  Inode 0x%lx is corrupt.  "
+                                        "Run chkdsk.", vi->i_ino);
+                goto err_out;
+        }
+        /* Only if the size has changed is there anything to do. */
+        if (ntfs_attr_size(ctx->attr) != i_size_read(vi)) {
+                // TODO: Implement the truncate...
+                ntfs_error(vi->i_sb, "Inode size has changed but this is not "
+                        "implemented yet.  Resetting inode size to old value. "
+                        " This is most likely a bug in the ntfs driver!");
+                i_size_write(vi, ntfs_attr_size(ctx->attr));
+        }
+        ntfs_attr_put_search_ctx(ctx);
+        unmap_mft_record(ni);
+        NInoClearTruncateFailed(ni);
+        ntfs_debug("Done.");
+        return 0;
+err_out:
+        /* Running out of memory says nothing about the state of the volume. */
+        if (err != -ENOMEM) {
+                NVolSetErrors(vol);
+                make_bad_inode(vi);
+        }
+        if (ctx)
+                ntfs_attr_put_search_ctx(ctx);
+        if (m)
+                unmap_mft_record(ni);
+        NInoSetTruncateFailed(ni);
+        return err;
+}
+/**
+ * ntfs_truncate_vfs - variant of ntfs_truncate() without a return value
+ * @vi:         inode for which the i_size was changed
+ *
+ * Calls ntfs_truncate() on @vi and deliberately discards its return value.
+ *
+ * See the ntfs_truncate() description above for details.
+ */
+void ntfs_truncate_vfs(struct inode *vi)
+{
+        (void)ntfs_truncate(vi);
+}
+/**
+ * ntfs_setattr - called from notify_change() when an attribute is being changed
+ * @dentry:     dentry whose attributes to change
+ * @attr:       structure describing the attributes and the changes
+ *
+ * Changes in i_size are not implemented yet, so VFS attempts to truncate the
+ * file described by @dentry are trapped here, as early as possible, and
+ * aborted.
+ *
+ * Changes of user, group, and mode are aborted as well, because the NTFS ACLs
+ * are not implemented yet.
+ *
+ * What remains is a copy of the generic notify_change() and inode_setattr()
+ * functionality: the requested atime, mtime and ctime are copied into the
+ * inode and the inode is marked dirty.
+ *
+ * Returns 0 on success, the error returned by inode_change_ok() if the change
+ * is not allowed, or -EOPNOTSUPP for the unsupported changes described above.
+ *
+ * Called with ->i_sem held.  For the ATTR_SIZE (i.e. ->truncate) case, also
+ * called with ->i_alloc_sem held for writing.
+ */
+int ntfs_setattr(struct dentry *dentry, struct iattr *attr)
+{
+        struct inode *vi = dentry->d_inode;
+        unsigned int valid = attr->ia_valid;
+        int ret = inode_change_ok(vi, attr);
+        if (ret)
+                return ret;
+        /* We do not support NTFS ACLs yet. */
+        if (valid & (ATTR_UID | ATTR_GID | ATTR_MODE)) {
+                ntfs_warning(vi->i_sb, "Changes in user/group/mode are not "
+                                "supported yet, ignoring.");
+                return -EOPNOTSUPP;
+        }
+        if (valid & ATTR_SIZE) {
+                if (attr->ia_size == i_size_read(vi)) {
+                        /*
+                         * We skipped the truncate but must still update
+                         * timestamps.
+                         */
+                        valid |= ATTR_MTIME | ATTR_CTIME;
+                } else {
+                        ntfs_warning(vi->i_sb, "Changes in inode size are not "
+                                        "supported yet, ignoring.");
+                        ret = -EOPNOTSUPP;
+                        /*
+                         * TODO: Implement...
+                         * ret = vmtruncate(vi, attr->ia_size);
+                         */
+                        if (ret || valid == ATTR_SIZE)
+                                return ret;
+                }
+        }
+        /* Copy over whichever timestamps were requested. */
+        if (valid & ATTR_ATIME)
+                vi->i_atime = attr->ia_atime;
+        if (valid & ATTR_MTIME)
+                vi->i_mtime = attr->ia_mtime;
+        if (valid & ATTR_CTIME)
+                vi->i_ctime = attr->ia_ctime;
+        mark_inode_dirty(vi);
+        return ret;
+}
+/**
+ * ntfs_write_inode - write out a dirty inode
+ * @vi:         inode to write out
+ * @sync:       if true, write out synchronously
+ *
+ * Write out a dirty inode to disk including any extent inodes if present.
+ *
+ * If @sync is true, commit the inode to disk and wait for io completion.  This
+ * is done using write_mft_record().
+ *
+ * If @sync is false, just schedule the write to happen but do not wait for i/o
+ * completion.  In 2.6 kernels, scheduling usually happens just by virtue of
+ * marking the page (and in this case mft record) dirty but we do not implement
+ * this yet as write_mft_record() largely ignores the @sync parameter and
+ * always performs synchronous writes.
+ *
+ * Attribute (fake) inodes are not written here; they are only marked clean.
+ *
+ * Before writing, the mtime, ctime and atime of @vi are copied into the
+ * standard information attribute if they differ from what is stored there.
+ *
+ * On failure with -ENOMEM the inode is marked dirty again so that the write
+ * is retried later.  On any other error the inode is marked bad and the
+ * volume is flagged as having errors.
+ *
+ * Return 0 on success and -errno on error.
+ */
+int ntfs_write_inode(struct inode *vi, int sync)
+{
+        sle64 nt;
+        ntfs_inode *ni = NTFS_I(vi);
+        ntfs_attr_search_ctx *ctx;
+        MFT_RECORD *m;
+        STANDARD_INFORMATION *si;
+        int err = 0;
+        BOOL modified = FALSE;
+        ntfs_debug("Entering for %sinode 0x%lx.", NInoAttr(ni) ? "attr " : "",
+                        vi->i_ino);
+        /*
+         * Dirty attribute inodes are written via their real inodes so just
+         * clean them here.  Access time updates are taken care of when the
+         * real inode is written.
+         */
+        if (NInoAttr(ni)) {
+                NInoClearDirty(ni);
+                ntfs_debug("Done.");
+                return 0;
+        }
+        /* Map, pin, and lock the mft record belonging to the inode. */
+        m = map_mft_record(ni);
+        if (IS_ERR(m)) {
+                err = PTR_ERR(m);
+                goto err_out;
+        }
+        /* Update the access times in the standard information attribute. */
+        ctx = ntfs_attr_get_search_ctx(ni, m);
+        if (unlikely(!ctx)) {
+                err = -ENOMEM;
+                goto unm_err_out;
+        }
+        err = ntfs_attr_lookup(AT_STANDARD_INFORMATION, NULL, 0,
+                        CASE_SENSITIVE, 0, NULL, 0, ctx);
+        if (unlikely(err)) {
+                ntfs_attr_put_search_ctx(ctx);
+                goto unm_err_out;
+        }
+        si = (STANDARD_INFORMATION*)((u8*)ctx->attr +
+                        le16_to_cpu(ctx->attr->data.resident.value_offset));
+        /*
+         * Update the times if they have changed.  Each VFS time is converted
+         * with utc2ntfs() and compared against the stored value so that the
+         * record is only modified (and later dirtied) when needed.
+         */
+        nt = utc2ntfs(vi->i_mtime);
+        if (si->last_data_change_time != nt) {
+                ntfs_debug("Updating mtime for inode 0x%lx: old = 0x%llx, "
+                                "new = 0x%llx", vi->i_ino,
+                                sle64_to_cpu(si->last_data_change_time),
+                                sle64_to_cpu(nt));
+                si->last_data_change_time = nt;
+                modified = TRUE;
+        }
+        nt = utc2ntfs(vi->i_ctime);
+        if (si->last_mft_change_time != nt) {
+                ntfs_debug("Updating ctime for inode 0x%lx: old = 0x%llx, "
+                                "new = 0x%llx", vi->i_ino,
+                                sle64_to_cpu(si->last_mft_change_time),
+                                sle64_to_cpu(nt));
+                si->last_mft_change_time = nt;
+                modified = TRUE;
+        }
+        nt = utc2ntfs(vi->i_atime);
+        if (si->last_access_time != nt) {
+                ntfs_debug("Updating atime for inode 0x%lx: old = 0x%llx, "
+                                "new = 0x%llx", vi->i_ino,
+                                sle64_to_cpu(si->last_access_time),
+                                sle64_to_cpu(nt));
+                si->last_access_time = nt;
+                modified = TRUE;
+        }
+        /*
+         * If we just modified the standard information attribute we need to
+         * mark the mft record it is in dirty.  We do this manually so that
+         * mark_inode_dirty() is not called which would redirty the inode and
+         * hence result in an infinite loop of trying to write the inode.
+         * There is no need to mark the base inode nor the base mft record
+         * dirty, since we are going to write this mft record below in any case
+         * and the base mft record may actually not have been modified so it
+         * might not need to be written out.
+         * NOTE: It is not a problem when the inode for $MFT itself is being
+         * written out as mark_ntfs_record_dirty() will only set I_DIRTY_PAGES
+         * on the $MFT inode and hence ntfs_write_inode() will not be
+         * re-invoked because of it which in turn is ok since the dirtied mft
+         * record will be cleaned and written out to disk below, i.e. before
+         * this function returns.
+         */
+        if (modified && !NInoTestSetDirty(ctx->ntfs_ino))
+                mark_ntfs_record_dirty(ctx->ntfs_ino->page,
+                                ctx->ntfs_ino->page_ofs);
+        ntfs_attr_put_search_ctx(ctx);
+        /*
+         * Now the access times are updated, write the base mft record if it
+         * is dirty.
+         */
+        if (NInoDirty(ni))
+                err = write_mft_record(ni, m, sync);
+        /*
+         * Write all dirty attached extent mft records.  A failure on one
+         * extent does not stop the loop.  The error to return is only
+         * overwritten while it is still zero or -ENOMEM, so the first error
+         * other than -ENOMEM is the one that gets reported.
+         */
+        down(&ni->extent_lock);
+        if (ni->nr_extents > 0) {
+                ntfs_inode **extent_nis = ni->ext.extent_ntfs_inos;
+                int i;
+                ntfs_debug("Writing %i extent inodes.", ni->nr_extents);
+                for (i = 0; i < ni->nr_extents; i++) {
+                        ntfs_inode *tni = extent_nis[i];
+                        if (NInoDirty(tni)) {
+                                MFT_RECORD *tm = map_mft_record(tni);
+                                int ret;
+                                if (IS_ERR(tm)) {
+                                        if (!err || err == -ENOMEM)
+                                                err = PTR_ERR(tm);
+                                        continue;
+                                }
+                                ret = write_mft_record(tni, tm, sync);
+                                unmap_mft_record(tni);
+                                if (unlikely(ret)) {
+                                        if (!err || err == -ENOMEM)
+                                                err = ret;
+                                }
+                        }
+                }
+        }
+        up(&ni->extent_lock);
+        unmap_mft_record(ni);
+        if (unlikely(err))
+                goto err_out;
+        ntfs_debug("Done.");
+        return 0;
+unm_err_out:
+        unmap_mft_record(ni);
+err_out:
+        if (err == -ENOMEM) {
+                ntfs_warning(vi->i_sb, "Not enough memory to write inode.  "
+                                "Marking the inode dirty again, so the VFS "
+                                "retries later.");
+                mark_inode_dirty(vi);
+        } else {
+                ntfs_error(vi->i_sb, "Failed (error code %i):  Marking inode "
+                                "as bad.  You should run chkdsk.", -err);
+                make_bad_inode(vi);
+                NVolSetErrors(ni->vol);
+        }
+        return err;
+}
+#endif /* NTFS_RW */
diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h
new file mode 100644
index 000000000000..99580455f2ed
--- /dev/null
+++ b/fs/ntfs/inode.h
@@ -0,0 +1,321 @@
+/*
+ * inode.h - Defines for inode structures NTFS Linux kernel driver. Part of
+ *           the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2004 Anton Altaparmakov
+ * Copyright (c) 2002 Richard Russon
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#ifndef _LINUX_NTFS_INODE_H
+#define _LINUX_NTFS_INODE_H
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/seq_file.h>
+#include <linux/list.h>
+#include <asm/atomic.h>
+#include <asm/semaphore.h>
+#include "layout.h"
+#include "volume.h"
+#include "types.h"
+#include "runlist.h"
+#include "debug.h"
+typedef struct _ntfs_inode ntfs_inode;
+/*
+ * The NTFS in-memory inode structure. It is just used as an extension to the
+ * fields already provided in the VFS inode.
+ *
+ * Real and fake (attribute) inodes embed this structure together with the VFS
+ * inode in a big_ntfs_inode; extent inodes have no VFS inode.
+ */
+struct _ntfs_inode {
+        s64 initialized_size;   /* Copy from the attribute record. */
+        s64 allocated_size;     /* Copy from the attribute record. */
+        unsigned long state;    /* NTFS specific flags describing this inode.
+                                   See ntfs_inode_state_bits below. */
+        unsigned long mft_no;   /* Number of the mft record / inode. */
+        u16 seq_no;             /* Sequence number of the mft record. */
+        atomic_t count;         /* Inode reference count for book keeping. */
+        ntfs_volume *vol;       /* Pointer to the ntfs volume of this inode. */
+        /*
+         * If NInoAttr() is true, the below fields describe the attribute which
+         * this fake inode belongs to. The actual inode of this attribute is
+         * pointed to by base_ntfs_ino and nr_extents is always set to -1 (see
+         * below). For real inodes, we also set the type (AT_DATA for files and
+         * AT_INDEX_ALLOCATION for directories), with the name = NULL and
+         * name_len = 0 for files and name = I30 (global constant) and
+         * name_len = 4 for directories.
+         */
+        ATTR_TYPE type; /* Attribute type of this fake inode. */
+        ntfschar *name;         /* Attribute name of this fake inode. */
+        u32 name_len;           /* Attribute name length of this fake inode. */
+        runlist runlist;        /* If state has the NI_NonResident bit set,
+                                   the runlist of the unnamed data attribute
+                                   (if a file) or of the index allocation
+                                   attribute (directory) or of the attribute
+                                   described by the fake inode (if NInoAttr()).
+                                   If runlist.rl is NULL, the runlist has not
+                                   been read in yet or has been unmapped. If
+                                   NI_NonResident is clear, the attribute is
+                                   resident (file and fake inode) or there is
+                                   no $I30 index allocation attribute
+                                   (small directory). In the latter case
+                                   runlist.rl is always NULL. */
+        /*
+         * The following fields are only valid for real inodes and extent
+         * inodes.
+         */
+        struct semaphore mrec_lock; /* Lock for serializing access to the
+                                   mft record belonging to this inode. */
+        struct page *page;      /* The page containing the mft record of the
+                                   inode. This should only be touched by the
+                                   (un)map_mft_record*() functions. */
+        int page_ofs;           /* Offset into the page at which the mft record
+                                   begins. This should only be touched by the
+                                   (un)map_mft_record*() functions. */
+        /*
+         * Attribute list support (only for use by the attribute lookup
+         * functions). Setup during read_inode for all inodes with attribute
+         * lists. Only valid if NI_AttrList is set in state, and attr_list_rl is
+         * further only valid if NI_AttrListNonResident is set.
+         */
+        u32 attr_list_size;     /* Length of attribute list value in bytes. */
+        u8 *attr_list;          /* Attribute list value itself. */
+        runlist attr_list_rl;   /* Run list for the attribute list value. */
+        union {
+                struct { /* It is a directory, $MFT, or an index inode. */
+                        struct inode *bmp_ino;  /* Attribute inode for the
+                                                   index $BITMAP. */
+                        u32 block_size;         /* Size of an index block. */
+                        u32 vcn_size;           /* Size of a vcn in this
+                                                   index. */
+                        COLLATION_RULE collation_rule; /* The collation rule
+                                                   for the index. */
+                        u8 block_size_bits;     /* Log2 of block_size. */
+                        u8 vcn_size_bits;       /* Log2 of vcn_size. */
+                } index;
+                struct { /* It is a compressed file or an attribute inode. */
+                        s64 size;               /* Copy of compressed_size from
+                                                   $DATA. */
+                        u32 block_size;         /* Size of a compression block
+                                                   (cb). */
+                        u8 block_size_bits;     /* Log2 of the size of a cb. */
+                        u8 block_clusters;      /* Number of clusters per cb. */
+                } compressed;
+        } itype;
+        struct semaphore extent_lock;   /* Lock for accessing/modifying the
+                                           below fields (nr_extents and
+                                           ext). */
+        s32 nr_extents; /* For a base mft record, the number of attached extent
+                           inodes (0 if none), for extent records and for fake
+                           inodes describing an attribute this is -1. */
+        union {         /* This union is only used if nr_extents != 0. */
+                ntfs_inode **extent_ntfs_inos;  /* For nr_extents > 0, array of
+                                                   the ntfs inodes of the extent
+                                                   mft records belonging to
+                                                   this base inode which have
+                                                   been loaded. */
+                ntfs_inode *base_ntfs_ino;      /* For nr_extents == -1, the
+                                                   ntfs inode of the base mft
+                                                   record. For fake inodes, the
+                                                   real (base) inode to which
+                                                   the attribute belongs. */
+        } ext;
+};
+/*
+ * Defined bits for the state field in the ntfs_inode structure.
+ * (f) = files only, (d) = directories only, (a) = attributes/fake inodes only
+ *
+ * The values are bit numbers (not masks).  They are tested, set and cleared
+ * in ntfs_inode->state through the NIno*() helper functions generated by the
+ * NINO_FNS() and TAS_NINO_FNS() macros.
+ */
+typedef enum {
+        NI_Dirty,               /* 1: Mft record needs to be written to disk. */
+        NI_AttrList,            /* 1: Mft record contains an attribute list. */
+        NI_AttrListNonResident, /* 1: Attribute list is non-resident. Implies
+                                      NI_AttrList is set. */
+        NI_Attr,                /* 1: Fake inode for attribute i/o.
+                                   0: Real inode or extent inode. */
+        NI_MstProtected,        /* 1: Attribute is protected by MST fixups.
+                                   0: Attribute is not protected by fixups. */
+        NI_NonResident,         /* 1: Unnamed data attr is non-resident (f).
+                                   1: Attribute is non-resident (a). */
+        NI_IndexAllocPresent = NI_NonResident,  /* 1: $I30 index alloc attr is
+                                                   present (d).  Shares its
+                                                   bit with NI_NonResident. */
+        NI_Compressed,          /* 1: Unnamed data attr is compressed (f).
+                                   1: Create compressed files by default (d).
+                                   1: Attribute is compressed (a). */
+        NI_Encrypted,           /* 1: Unnamed data attr is encrypted (f).
+                                   1: Create encrypted files by default (d).
+                                   1: Attribute is encrypted (a). */
+        NI_Sparse,              /* 1: Unnamed data attr is sparse (f).
+                                   1: Create sparse files by default (d).
+                                   1: Attribute is sparse (a). */
+        NI_TruncateFailed,      /* 1: Last ntfs_truncate() call failed. */
+} ntfs_inode_state_bits;
+/*
+ * NOTE: We should be adding dirty mft records to a list somewhere and they
+ * should be independent of the (ntfs/vfs) inode structure so that an inode can
+ * be removed but the record can be left dirty for syncing later.
+ */
+/*
+ * Macro tricks to expand the NInoFoo(), NInoSetFoo(), and NInoClearFoo()
+ * functions.
+ *
+ * NINO_FNS(Foo) defines three static inline helpers operating on bit NI_Foo
+ * of ni->state: NInoFoo() returns the result of test_bit(), NInoSetFoo()
+ * calls set_bit() and NInoClearFoo() calls clear_bit().
+ */
+#define NINO_FNS(flag)                                  \
+static inline int NIno##flag(ntfs_inode *ni)            \
+{                                                       \
+        return test_bit(NI_##flag, &(ni)->state);       \
+}                                                       \
+static inline void NInoSet##flag(ntfs_inode *ni)        \
+{                                                       \
+        set_bit(NI_##flag, &(ni)->state);               \
+}                                                       \
+static inline void NInoClear##flag(ntfs_inode *ni)      \
+{                                                       \
+        clear_bit(NI_##flag, &(ni)->state);             \
+}
+/*
+ * As above for NInoTestSetFoo() and NInoTestClearFoo().
+ *
+ * TAS_NINO_FNS(Foo) defines two static inline helpers operating on bit NI_Foo
+ * of ni->state: NInoTestSetFoo() returns the result of test_and_set_bit() and
+ * NInoTestClearFoo() returns the result of test_and_clear_bit(), i.e. the
+ * value the bit had before it was changed.
+ */
+#define TAS_NINO_FNS(flag)                                      \
+static inline int NInoTestSet##flag(ntfs_inode *ni)             \
+{                                                               \
+        return test_and_set_bit(NI_##flag, &(ni)->state);       \
+}                                                               \
+static inline int NInoTestClear##flag(ntfs_inode *ni)           \
+{                                                               \
+        return test_and_clear_bit(NI_##flag, &(ni)->state);     \
+}
+/*
+ * Emit the ntfs inode bitops functions, one set per state bit.  Only the
+ * Dirty bit additionally gets the test-and-set/test-and-clear variants.
+ */
+NINO_FNS(Dirty)
+TAS_NINO_FNS(Dirty)
+NINO_FNS(AttrList)
+NINO_FNS(AttrListNonResident)
+NINO_FNS(Attr)
+NINO_FNS(MstProtected)
+NINO_FNS(NonResident)
+NINO_FNS(IndexAllocPresent)
+NINO_FNS(Compressed)
+NINO_FNS(Encrypted)
+NINO_FNS(Sparse)
+NINO_FNS(TruncateFailed)
+/*
+ * The full structure containing a ntfs_inode and a vfs struct inode. Used for
+ * all real and fake inodes but not for extent inodes which lack the vfs struct
+ * inode.
+ *
+ * NTFS_I() and VFS_I() convert between pointers to the two members.
+ */
+typedef struct {
+        ntfs_inode ntfs_inode;          /* The ntfs inode structure. */
+        struct inode vfs_inode;         /* The vfs inode structure. */
+} big_ntfs_inode;
+/**
+ * NTFS_I - return the ntfs inode given a vfs inode
+ * @inode:      VFS inode
+ *
+ * NTFS_I() returns the ntfs inode associated with the VFS @inode.
+ *
+ * @inode must be the vfs_inode member of a big_ntfs_inode.  The containing
+ * big_ntfs_inode is located with container_of() and the address of its
+ * ntfs_inode member is returned, so the result does not depend on the order
+ * of the members inside big_ntfs_inode.
+ */
+static inline ntfs_inode *NTFS_I(struct inode *inode)
+{
+        return &container_of(inode, big_ntfs_inode, vfs_inode)->ntfs_inode;
+}
+/**
+ * VFS_I - return the vfs inode given an ntfs inode
+ * @ni:         ntfs inode which is embedded in a big_ntfs_inode
+ *
+ * VFS_I() returns the VFS inode associated with the ntfs inode @ni.
+ *
+ * @ni must be the ntfs_inode member of a big_ntfs_inode, i.e. it must belong
+ * to a real or fake inode.  Extent inodes lack the vfs struct inode and must
+ * not be passed in.  The containing big_ntfs_inode is located with
+ * container_of(), so the result does not depend on the order of the members
+ * inside big_ntfs_inode.
+ */
+static inline struct inode *VFS_I(ntfs_inode *ni)
+{
+        return &container_of(ni, big_ntfs_inode, ntfs_inode)->vfs_inode;
+}
+/**
+ * ntfs_attr - ntfs in memory attribute structure
+ * @mft_no:     mft record number of the base mft record of this attribute
+ * @name:       Unicode name of the attribute (NULL if unnamed)
+ * @name_len:   length of @name in Unicode characters (0 if unnamed)
+ * @type:       attribute type (see layout.h)
+ *
+ * This structure exists only to provide a small structure for the
+ * ntfs_{attr_}iget()/ntfs_test_inode()/ntfs_init_locked_inode() mechanism.
+ * It bundles the values which together identify one attribute of one inode.
+ *
+ * NOTE: Elements are ordered by size to make the structure as compact as
+ * possible on all architectures.
+ */
+typedef struct {
+        unsigned long mft_no;
+        ntfschar *name;
+        u32 name_len;
+        ATTR_TYPE type;
+} ntfs_attr;
+typedef int (*test_t)(struct inode *, void *); /* Inode test callback type;
+                                                  presumably used to pass
+                                                  ntfs_test_inode() as a
+                                                  generic callback - confirm
+                                                  against the callers. */
+extern int ntfs_test_inode(struct inode *vi, ntfs_attr *na);
+extern struct inode *ntfs_iget(struct super_block *sb, unsigned long mft_no);
+extern struct inode *ntfs_attr_iget(struct inode *base_vi, ATTR_TYPE type,
+                ntfschar *name, u32 name_len);
+extern struct inode *ntfs_index_iget(struct inode *base_vi, ntfschar *name,
+                u32 name_len);
+extern struct inode *ntfs_alloc_big_inode(struct super_block *sb);
+extern void ntfs_destroy_big_inode(struct inode *inode);
+extern void ntfs_clear_big_inode(struct inode *vi);
+extern void __ntfs_init_inode(struct super_block *sb, ntfs_inode *ni);
+/**
+ * ntfs_init_big_inode - initialize the ntfs inode belonging to a vfs inode
+ * @vi:         VFS inode whose ntfs inode is to be initialized
+ *
+ * Run __ntfs_init_inode() on the ntfs inode associated with @vi and then set
+ * the mft record number of the ntfs inode to the inode number of @vi.
+ */
+static inline void ntfs_init_big_inode(struct inode *vi)
+{
+        ntfs_inode *ni;
+        ntfs_debug("Entering.");
+        ni = NTFS_I(vi);
+        __ntfs_init_inode(vi->i_sb, ni);
+        /* Set after the generic initialization has been done. */
+        ni->mft_no = vi->i_ino;
+}
+extern ntfs_inode *ntfs_new_extent_inode(struct super_block *sb,
+                unsigned long mft_no);
+extern void ntfs_clear_extent_inode(ntfs_inode *ni);
+extern int ntfs_read_inode_mount(struct inode *vi);
+extern void ntfs_put_inode(struct inode *vi);
+extern int ntfs_show_options(struct seq_file *sf, struct vfsmount *mnt);
+#ifdef NTFS_RW
+extern int ntfs_truncate(struct inode *vi);
+extern void ntfs_truncate_vfs(struct inode *vi);
+extern int ntfs_setattr(struct dentry *dentry, struct iattr *attr);
+extern int ntfs_write_inode(struct inode *vi, int sync);
+/**
+ * ntfs_commit_inode - write out an inode synchronously unless it is bad
+ * @vi:         inode to write out
+ *
+ * Calls ntfs_write_inode() on @vi with sync set to 1, unless is_bad_inode()
+ * reports @vi as bad in which case nothing is done.  The return value of
+ * ntfs_write_inode() is not propagated to the caller.
+ */
+static inline void ntfs_commit_inode(struct inode *vi)
+{
+        if (is_bad_inode(vi))
+                return;
+        ntfs_write_inode(vi, 1);
+}
+#endif /* NTFS_RW */
+#endif /* _LINUX_NTFS_INODE_H */
diff --git a/fs/ntfs/layout.h b/fs/ntfs/layout.h
new file mode 100644
index 000000000000..47b338999921
--- /dev/null
+++ b/fs/ntfs/layout.h
@@ -0,0 +1,2413 @@
+/*
+ * layout.h - All NTFS associated on-disk structures. Part of the Linux-NTFS
+ *            project.
+ *
+ * Copyright (c) 2001-2004 Anton Altaparmakov
+ * Copyright (c) 2002 Richard Russon
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#ifndef _LINUX_NTFS_LAYOUT_H
+#define _LINUX_NTFS_LAYOUT_H
+#include <linux/types.h>
+#include <linux/bitops.h>
+#include <linux/list.h>
+#include <asm/byteorder.h>
+#include "types.h"
+/*
+ * Constant endianness conversion defines.
+ */
+#define const_le16_to_cpu(x)    __constant_le16_to_cpu(x)
+#define const_le32_to_cpu(x)    __constant_le32_to_cpu(x)
+#define const_le64_to_cpu(x)    __constant_le64_to_cpu(x)
+#define const_cpu_to_le16(x)    __constant_cpu_to_le16(x)
+#define const_cpu_to_le32(x)    __constant_cpu_to_le32(x)
+#define const_cpu_to_le64(x)    __constant_cpu_to_le64(x)
+/* The NTFS oem_id "NTFS    " */
+#define magicNTFS       const_cpu_to_le64(0x202020205346544eULL)
+/*
+ * Location of bootsector on partition:
+ *      The standard NTFS_BOOT_SECTOR is on sector 0 of the partition.
+ *      On NT4 and above there is one backup copy of the boot sector to
+ *      be found on the last sector of the partition (not normally accessible
+ *      from within Windows as the bootsector contained number of sectors
+ *      value is one less than the actual value!).
+ *      On versions of NT 3.51 and earlier, the backup copy was located at
+ *      number of sectors/2 (integer divide), i.e. in the middle of the volume.
+ */
+/*
+ * BIOS parameter block (bpb) structure.
+ */
+typedef struct {
+        le16 bytes_per_sector;          /* Size of a sector in bytes. */
+        u8  sectors_per_cluster;        /* Size of a cluster in sectors. */
+        le16 reserved_sectors;          /* zero */
+        u8  fats;                       /* zero */
+        le16 root_entries;              /* zero */
+        le16 sectors;                   /* zero */
+        u8  media_type;                 /* 0xf8 = hard disk */
+        le16 sectors_per_fat;           /* zero */
+        le16 sectors_per_track;         /* irrelevant */
+        le16 heads;                     /* irrelevant */
+        le32 hidden_sectors;            /* zero */
+        le32 large_sectors;             /* zero */
+} __attribute__ ((__packed__)) BIOS_PARAMETER_BLOCK; /* sizeof() = 25 bytes */
+/*
+ * NTFS boot sector structure.
+ */
+typedef struct {
+/*0x00*/u8  jump[3];                    /* Irrelevant (jump to boot up code).*/
+/*0x03*/le64 oem_id;                    /* Magic "NTFS    ". */
+/*0x0b*/BIOS_PARAMETER_BLOCK bpb;       /* See BIOS_PARAMETER_BLOCK. */
+/*0x24*/u8  unused[4];                  /* zero, NTFS diskedit.exe states that
+                                           this is actually:
+                                                __u8 physical_drive;    // 0x80
+                                                __u8 current_head;      // zero
+                                                __u8 extended_boot_signature;
+                                                                        // 0x80
+                                                __u8 unused;            // zero
+                                         */
+/*0x28*/sle64 number_of_sectors;        /* Number of sectors in volume. Gives
+                                           maximum volume size of 2^63 sectors.
+                                           Assuming standard sector size of 512
+                                           bytes, the maximum byte size is
+                                           approx. 4.7x10^21 bytes. (-; */
+/*0x30*/sle64 mft_lcn;                  /* Cluster location of mft data. */
+/*0x38*/sle64 mftmirr_lcn;              /* Cluster location of copy of mft. */
+/*0x40*/s8  clusters_per_mft_record;    /* Mft record size in clusters. */
+/*0x41*/u8  reserved0[3];               /* zero */
+/*0x44*/s8  clusters_per_index_record;  /* Index block size in clusters. */
+/*0x45*/u8  reserved1[3];               /* zero */
+/*0x48*/le64 volume_serial_number;      /* Irrelevant (serial number). */
+/*0x50*/le32 checksum;                  /* Boot sector checksum. */
+/*0x54*/u8  bootstrap[426];             /* Irrelevant (boot up code). */
+/*0x1fe*/le16 end_of_sector_marker;     /* End of bootsector magic. Always is
+                                           0xaa55 in little endian. */
+/* sizeof() = 512 (0x200) bytes */
+} __attribute__ ((__packed__)) NTFS_BOOT_SECTOR;
+/*
+ * Magic identifiers present at the beginning of all records which start with
+ * an NTFS_RECORD header (like mft records for example).
+ */
+enum {  /* 4-letter magics: ASCII name, first character in the low byte. */
+        /* Found in $MFT/$DATA. */
+        magic_FILE = const_cpu_to_le32(0x454c4946), /* Mft entry. */
+        magic_INDX = const_cpu_to_le32(0x58444e49), /* Index buffer. */
+        magic_HOLE = const_cpu_to_le32(0x454c4f48), /* ? (NTFS 3.0+?) */
+        /* Found in $LogFile/$DATA. */
+        magic_RSTR = const_cpu_to_le32(0x52545352), /* Restart page. */
+        magic_RCRD = const_cpu_to_le32(0x44524352), /* Log record page. */
+        /* Found in $LogFile/$DATA.  (May be found in $MFT/$DATA, also?) */
+        magic_CHKD = const_cpu_to_le32(0x444b4843), /* Modified by chkdsk. */
+        /* Found in all ntfs record containing records. */
+        magic_BAAD = const_cpu_to_le32(0x44414142), /* Failed multi sector
+                                                       transfer was detected. */
+        /*
+         * Found in $LogFile/$DATA when a page is full of 0xff bytes and is
+         * thus not initialized.  Page must be initialized before using it.
+         */
+        magic_empty = const_cpu_to_le32(0xffffffff) /* Record is empty. */
+};
+typedef le32 NTFS_RECORD_TYPE;
+/*
+ * Generic magic comparison macros. Finally found a use for the ## preprocessor
+ * operator! (-8
+ */
+static inline BOOL __ntfs_is_magic(le32 x, NTFS_RECORD_TYPE r)
+{
+        return x == r;  /* Both are little endian, so compare them raw. */
+}
+#define ntfs_is_magic(x, m)     __ntfs_is_magic(x, magic_##m)
+static inline BOOL __ntfs_is_magicp(le32 *p, NTFS_RECORD_TYPE r)
+{
+        return p[0] == r;       /* Compare the le32 stored at @p with @r. */
+}
+#define ntfs_is_magicp(p, m)    __ntfs_is_magicp(p, magic_##m)
+/*
+ * Specialised magic comparison macros for the NTFS_RECORD_TYPEs defined above.
+ */
+#define ntfs_is_file_record(x)          ( ntfs_is_magic (x, FILE) )
+#define ntfs_is_file_recordp(p)         ( ntfs_is_magicp(p, FILE) )
+#define ntfs_is_mft_record(x)           ( ntfs_is_file_record (x) )
+#define ntfs_is_mft_recordp(p)          ( ntfs_is_file_recordp(p) )
+#define ntfs_is_indx_record(x)          ( ntfs_is_magic (x, INDX) )
+#define ntfs_is_indx_recordp(p)         ( ntfs_is_magicp(p, INDX) )
+#define ntfs_is_hole_record(x)          ( ntfs_is_magic (x, HOLE) )
+#define ntfs_is_hole_recordp(p)         ( ntfs_is_magicp(p, HOLE) )
+#define ntfs_is_rstr_record(x)          ( ntfs_is_magic (x, RSTR) )
+#define ntfs_is_rstr_recordp(p)         ( ntfs_is_magicp(p, RSTR) )
+#define ntfs_is_rcrd_record(x)          ( ntfs_is_magic (x, RCRD) )
+#define ntfs_is_rcrd_recordp(p)         ( ntfs_is_magicp(p, RCRD) )
+#define ntfs_is_chkd_record(x)          ( ntfs_is_magic (x, CHKD) )
+#define ntfs_is_chkd_recordp(p)         ( ntfs_is_magicp(p, CHKD) )
+#define ntfs_is_baad_record(x)          ( ntfs_is_magic (x, BAAD) )
+#define ntfs_is_baad_recordp(p)         ( ntfs_is_magicp(p, BAAD) )
+#define ntfs_is_empty_record(x)         ( ntfs_is_magic (x, empty) )
+#define ntfs_is_empty_recordp(p)        ( ntfs_is_magicp(p, empty) )
+/*
+ * The Update Sequence Array (usa) is an array of the le16 values which belong
+ * to the end of each sector protected by the update sequence record in which
+ * this array is contained. Note that the first entry is the Update Sequence
+ * Number (usn), a cyclic counter of how many times the protected record has
+ * been written to disk. The values 0 and -1 (ie. 0xffff) are not used. All
+ * last le16's of each sector have to be equal to the usn (during reading) or
+ * are set to it (during writing). If they are not, an incomplete multi sector
+ * transfer has occurred when the data was written.
+ * The maximum size for the update sequence array is fixed to:
+ *      maximum size = usa_ofs + (usa_count * 2) = 510 bytes
+ * The 510 bytes comes from the fact that the last le16 in the array has to
+ * (obviously) finish before the last le16 of the first 512-byte sector.
+ * This formula can be used as a consistency check in that usa_ofs +
+ * (usa_count * 2) has to be less than or equal to 510.
+ */
+typedef struct {
+/*  0*/ NTFS_RECORD_TYPE magic; /* A four-byte magic identifying the record
+                                   type and/or status. */
+/*  4*/ le16 usa_ofs;           /* Offset to the Update Sequence Array (usa)
+                                   from the start of the ntfs record. */
+/*  6*/ le16 usa_count;         /* Number of le16 sized entries in the usa
+                                   including the Update Sequence Number (usn),
+                                   thus the number of fixups is the usa_count
+                                   minus 1. */
+} __attribute__ ((__packed__)) NTFS_RECORD;     /* sizeof() = 8 bytes */
+/*
+ * System files mft record numbers. All these files are always marked as used
+ * in the bitmap attribute of the mft; presumably in order to avoid accidental
+ * allocation for random other mft records. Also, the sequence number for each
+ * of the system files is always equal to their mft record number and it is
+ * never modified.
+ */
+typedef enum {
+        FILE_MFT       = 0,     /* Master file table (mft). Data attribute
+                                   contains the entries and bitmap attribute
+                                   records which ones are in use (bit==1). */
+        FILE_MFTMirr   = 1,     /* Mft mirror: copy of first four mft records
+                                   in data attribute. If cluster size > 4kiB,
+                                   copy of first N mft records, with
+                                        N = cluster_size / mft_record_size. */
+        FILE_LogFile   = 2,     /* Journalling log in data attribute. */
+        FILE_Volume    = 3,     /* Volume name attribute and volume information
+                                   attribute (flags and ntfs version). Windows
+                                   refers to this file as volume DASD (Direct
+                                   Access Storage Device). */
+        FILE_AttrDef   = 4,     /* Array of attribute definitions in data
+                                   attribute. */
+        FILE_root      = 5,     /* Root directory. */
+        FILE_Bitmap    = 6,     /* Allocation bitmap of all clusters (lcns) in
+                                   data attribute. */
+        FILE_Boot      = 7,     /* Boot sector (always at cluster 0) in data
+                                   attribute. */
+        FILE_BadClus   = 8,     /* Contains all bad clusters in the non-resident
+                                   data attribute. */
+        FILE_Secure    = 9,     /* Shared security descriptors in data attribute
+                                   and two indexes into the descriptors.
+                                   Appeared in Windows 2000. Before that, this
+                                   file was named $Quota but was unused. */
+        FILE_UpCase    = 10,    /* Uppercase equivalents of all 65536 Unicode
+                                   characters in data attribute. */
+        FILE_Extend    = 11,    /* Directory containing other system files (eg.
+                                   $ObjId, $Quota, $Reparse and $UsnJrnl). This
+                                   is new to NTFS3.0. */
+        FILE_reserved12 = 12,   /* Reserved for future use (records 12-15). */
+        FILE_reserved13 = 13,
+        FILE_reserved14 = 14,
+        FILE_reserved15 = 15,
+        FILE_first_user = 16,   /* First user file, used as test limit for
+                                   whether to allow opening a file or not. */
+} NTFS_SYSTEM_FILES;
+/*
+ * These are the so far known MFT_RECORD_* flags (16-bit) which contain
+ * information about the mft record in which they are present.
+ */
+enum {
+        MFT_RECORD_IN_USE       = const_cpu_to_le16(0x0001),
+        MFT_RECORD_IS_DIRECTORY = const_cpu_to_le16(0x0002),
+} __attribute__ ((__packed__));
+typedef le16 MFT_RECORD_FLAGS;
+/*
+ * mft references (aka file references or file record segment references) are
+ * used whenever a structure needs to refer to a record in the mft.
+ *
+ * A reference consists of a 48-bit index into the mft and a 16-bit sequence
+ * number used to detect stale references.
+ *
+ * For error reporting purposes we treat the 48-bit index as a signed quantity.
+ *
+ * The sequence number is a circular counter (skipping 0) describing how many
+ * times the referenced mft record has been (re)used. This has to match the
+ * sequence number of the mft record being referenced, otherwise the reference
+ * is considered stale and removed (FIXME: only ntfsck or the driver itself?).
+ *
+ * If the sequence number is zero it is assumed that no sequence number
+ * consistency checking should be performed.
+ *
+ * FIXME: Since inodes are 32-bit as of now, the driver needs to always check
+ * for high_part being 0 and if not either BUG(), cause a panic() or handle
+ * the situation in some other way. This shouldn't be a problem as a volume has
+ * to become HUGE in order to need more than 32-bits worth of mft records.
+ * Assuming the standard mft record size of 1kb only the records (never mind
+ * the non-resident attributes, etc.) would require 4Tb of space on their own
+ * for the first 32 bits worth of records. This is only if some strange person
+ * doesn't decide to foul play and make the mft sparse which would be a really
+ * horrible thing to do as it would trash our current driver implementation. )-:
+ * Do I hear screams "we want 64-bit inodes!" ?!? (-;
+ *
+ * FIXME: The mft zone is defined as the first 12% of the volume. This space is
+ * reserved so that the mft can grow contiguously and hence doesn't become
+ * fragmented. Volume free space includes the empty part of the mft zone and
+ * when the volume's free 88% are used up, the mft zone is shrunk by a factor
+ * of 2, thus making more space available for more files/data. This process is
+ * repeated every time there is no more free space except for the mft zone until
+ * there really is no more free space.
+ */
+/*
+ * Typedef the MFT_REF as a 64-bit value for easier handling.
+ * Also define two unpacking macros to get to the reference (MREF) and
+ * sequence number (MSEQNO) respectively.
+ * The _LE versions are to be applied on little endian MFT_REFs.
+ * Note: The _LE versions will return a CPU endian formatted value!
+ */
+typedef enum {  /* NOTE: 64-bit enumerators, this relies on a gcc extension. */
+        MFT_REF_MASK_CPU        = 0x0000ffffffffffffULL, /* Low 48 bits. */
+        MFT_REF_MASK_LE         = const_cpu_to_le64(0x0000ffffffffffffULL),
+} MFT_REF_CONSTS;
+typedef u64 MFT_REF;
+typedef le64 leMFT_REF;
+#define MK_MREF(m, s)   ((MFT_REF)(((MFT_REF)(s) << 48) |               \
+                                        ((MFT_REF)(m) & MFT_REF_MASK_CPU)))
+#define MK_LE_MREF(m, s) cpu_to_le64(MK_MREF(m, s))
+#define MREF(x)         ((unsigned long)((x) & MFT_REF_MASK_CPU))
+#define MSEQNO(x)       ((u16)(((x) >> 48) & 0xffff))
+#define MREF_LE(x)      ((unsigned long)(le64_to_cpu(x) & MFT_REF_MASK_CPU))
+#define MSEQNO_LE(x)    ((u16)((le64_to_cpu(x) >> 48) & 0xffff))
+#define IS_ERR_MREF(x)  (((x) & 0x0000800000000000ULL) ? 1 : 0)
+#define ERR_MREF(x)     ((u64)((s64)(x)))
+#define MREF_ERR(x)     ((int)((s64)(x)))
+/*
+ * The mft record header present at the beginning of every record in the mft.
+ * This is followed by a sequence of variable length attribute records which
+ * is terminated by an attribute of type AT_END which is a truncated attribute
+ * in that it only consists of the attribute type code AT_END and none of the
+ * other members of the attribute structure are present.
+ */
+typedef struct {
+/*Ofs*/
+/*  0   NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */
+        NTFS_RECORD_TYPE magic; /* Usually the magic is "FILE". */
+        le16 usa_ofs;           /* See NTFS_RECORD definition above. */
+        le16 usa_count;         /* See NTFS_RECORD definition above. */
+/*  8*/ le64 lsn;               /* $LogFile sequence number for this record.
+                                   Changed every time the record is modified. */
+/* 16*/ le16 sequence_number;   /* Number of times this mft record has been
+                                   reused. (See description for MFT_REF
+                                   above.) NOTE: The increment (skipping zero)
+                                   is done when the file is deleted. NOTE: If
+                                   this is zero it is left zero. */
+/* 18*/ le16 link_count;        /* Number of hard links, i.e. the number of
+                                   directory entries referencing this record.
+                                   NOTE: Only used in mft base records.
+                                   NOTE: When deleting a directory entry we
+                                   check the link_count and if it is 1 we
+                                   delete the file. Otherwise we delete the
+                                   FILE_NAME_ATTR being referenced by the
+                                   directory entry from the mft record and
+                                   decrement the link_count.
+                                   FIXME: Careful with Win32 + DOS names! */
+/* 20*/ le16 attrs_offset;      /* Byte offset to the first attribute in this
+                                   mft record from the start of the mft record.
+                                   NOTE: Must be aligned to 8-byte boundary. */
+/* 22*/ MFT_RECORD_FLAGS flags; /* Bit array of MFT_RECORD_FLAGS. When a file
+                                   is deleted, the MFT_RECORD_IN_USE flag is
+                                   set to zero. */
+/* 24*/ le32 bytes_in_use;      /* Number of bytes used in this mft record.
+                                   NOTE: Must be aligned to 8-byte boundary. */
+/* 28*/ le32 bytes_allocated;   /* Number of bytes allocated for this mft
+                                   record. This should be equal to the mft
+                                   record size. */
+/* 32*/ leMFT_REF base_mft_record;/* This is zero for base mft records.
+                                   When it is not zero it is a mft reference
+                                   pointing to the base mft record to which
+                                   this record belongs (this is then used to
+                                   locate the attribute list attribute present
+                                   in the base record which describes this
+                                   extension record and hence might need
+                                   modification when the extension record
+                                   itself is modified; locating the
+                                   attribute list also means finding the other
+                                   potential extents, belonging to the non-base
+                                   mft record). */
+/* 40*/ le16 next_attr_instance;/* The instance number that will be assigned to
+                                   the next attribute added to this mft record.
+                                   NOTE: Incremented each time after it is used.
+                                   NOTE: Every time the mft record is reused
+                                   this number is set to zero.  NOTE: The first
+                                   instance number is always 0. */
+/* The below fields are specific to NTFS 3.1+ (Windows XP and above): */
+/* 42*/ le16 reserved;          /* Reserved/alignment. */
+/* 44*/ le32 mft_record_number; /* Number of this mft record. */
+/* sizeof() = 48 bytes */
+/*
+ * When (re)using the mft record, we place the update sequence array at this
+ * offset, i.e. before we start with the attributes.  This also makes sense,
+ * otherwise we could run into problems with the update sequence array
+ * containing in itself the last two bytes of a sector which would mean that
+ * multi sector transfer protection wouldn't work.  As you can't protect data
+ * by overwriting it since you then can't get it back...
+ * When reading we obviously use the data from the ntfs record header.
+ */
+} __attribute__ ((__packed__)) MFT_RECORD;
+/* This is the version without the NTFS 3.1+ specific fields. */
+typedef struct {
+/*Ofs*/
+/*  0   NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */
+        NTFS_RECORD_TYPE magic; /* Usually the magic is "FILE". */
+        le16 usa_ofs;           /* See NTFS_RECORD definition above. */
+        le16 usa_count;         /* See NTFS_RECORD definition above. */
+/*  8*/ le64 lsn;               /* $LogFile sequence number for this record.
+                                   Changed every time the record is modified. */
+/* 16*/ le16 sequence_number;   /* Number of times this mft record has been
+                                   reused. (See description for MFT_REF
+                                   above.) NOTE: The increment (skipping zero)
+                                   is done when the file is deleted. NOTE: If
+                                   this is zero it is left zero. */
+/* 18*/ le16 link_count;        /* Number of hard links, i.e. the number of
+                                   directory entries referencing this record.
+                                   NOTE: Only used in mft base records.
+                                   NOTE: When deleting a directory entry we
+                                   check the link_count and if it is 1 we
+                                   delete the file. Otherwise we delete the
+                                   FILE_NAME_ATTR being referenced by the
+                                   directory entry from the mft record and
+                                   decrement the link_count.
+                                   FIXME: Careful with Win32 + DOS names! */
+/* 20*/ le16 attrs_offset;      /* Byte offset to the first attribute in this
+                                   mft record from the start of the mft record.
+                                   NOTE: Must be aligned to 8-byte boundary. */
+/* 22*/ MFT_RECORD_FLAGS flags; /* Bit array of MFT_RECORD_FLAGS. When a file
+                                   is deleted, the MFT_RECORD_IN_USE flag is
+                                   set to zero. */
+/* 24*/ le32 bytes_in_use;      /* Number of bytes used in this mft record.
+                                   NOTE: Must be aligned to 8-byte boundary. */
+/* 28*/ le32 bytes_allocated;   /* Number of bytes allocated for this mft
+                                   record. This should be equal to the mft
+                                   record size. */
+/* 32*/ leMFT_REF base_mft_record;/* This is zero for base mft records.
+                                   When it is not zero it is a mft reference
+                                   pointing to the base mft record to which
+                                   this record belongs (this is then used to
+                                   locate the attribute list attribute present
+                                   in the base record which describes this
+                                   extension record and hence might need
+                                   modification when the extension record
+                                   itself is modified; locating the
+                                   attribute list also means finding the other
+                                   potential extents, belonging to the non-base
+                                   mft record). */
+/* 40*/ le16 next_attr_instance;/* The instance number that will be assigned to
+                                   the next attribute added to this mft record.
+                                   NOTE: Incremented each time after it is used.
+                                   NOTE: Every time the mft record is reused
+                                   this number is set to zero.  NOTE: The first
+                                   instance number is always 0. */
+/* sizeof() = 42 bytes */
+/*
+ * When (re)using the mft record, we place the update sequence array at this
+ * offset, i.e. before we start with the attributes.  This also makes sense,
+ * otherwise we could run into problems with the update sequence array
+ * containing in itself the last two bytes of a sector which would mean that
+ * multi sector transfer protection wouldn't work.  As you can't protect data
+ * by overwriting it since you then can't get it back...
+ * When reading we obviously use the data from the ntfs record header.
+ */
+} __attribute__ ((__packed__)) MFT_RECORD_OLD;
+/*
+ * System defined attributes (32-bit).  Each attribute type has a corresponding
+ * attribute name (Unicode string of maximum 64 character length) as described
+ * by the attribute definitions present in the data attribute of the $AttrDef
+ * system file.  On NTFS 3.0 volumes the names are just as the types are named
+ * in the below defines exchanging AT_ for the dollar sign ($).  If that is not
+ * a revealing choice of symbol I do not know what is... (-;
+ */
+enum {
+        AT_UNUSED                       = const_cpu_to_le32(         0),
+        AT_STANDARD_INFORMATION         = const_cpu_to_le32(      0x10),
+        AT_ATTRIBUTE_LIST               = const_cpu_to_le32(      0x20),
+        AT_FILE_NAME                    = const_cpu_to_le32(      0x30),
+        AT_OBJECT_ID                    = const_cpu_to_le32(      0x40),
+        AT_SECURITY_DESCRIPTOR          = const_cpu_to_le32(      0x50),
+        AT_VOLUME_NAME                  = const_cpu_to_le32(      0x60),
+        AT_VOLUME_INFORMATION           = const_cpu_to_le32(      0x70),
+        AT_DATA                         = const_cpu_to_le32(      0x80),
+        AT_INDEX_ROOT                   = const_cpu_to_le32(      0x90),
+        AT_INDEX_ALLOCATION             = const_cpu_to_le32(      0xa0),
+        AT_BITMAP                       = const_cpu_to_le32(      0xb0),
+        AT_REPARSE_POINT                = const_cpu_to_le32(      0xc0),
+        AT_EA_INFORMATION               = const_cpu_to_le32(      0xd0),
+        AT_EA                           = const_cpu_to_le32(      0xe0),
+        AT_PROPERTY_SET                 = const_cpu_to_le32(      0xf0),
+        AT_LOGGED_UTILITY_STREAM        = const_cpu_to_le32(     0x100),
+        AT_FIRST_USER_DEFINED_ATTRIBUTE = const_cpu_to_le32(    0x1000),
+        AT_END                          = const_cpu_to_le32(0xffffffff)
+};
+typedef le32 ATTR_TYPE;
+/*
+ * The collation rules for sorting views/indexes/etc (32-bit).
+ *
+ * COLLATION_BINARY - Collate by binary compare where the first byte is most
+ *      significant.
+ * COLLATION_UNICODE_STRING - Collate Unicode strings by comparing their binary
+ *      Unicode values, except that when a character can be uppercased, the
+ *      upper case value collates before the lower case one.
+ * COLLATION_FILE_NAME - Collate file names as Unicode strings. The collation
+ *      is done very much like COLLATION_UNICODE_STRING. In fact I have no idea
+ *      what the difference is. Perhaps the difference is that file names
+ *      would treat some special characters in an odd way (see
+ *      unistr.c::ntfs_collate_names() and unistr.c::legal_ansi_char_array[]
+ *      for what I mean) but COLLATION_UNICODE_STRING would not give any special
+ *      treatment to any characters at all, but this is speculation.
+ * COLLATION_NTOFS_ULONG - Sorting is done according to ascending le32 key
+ *      values. E.g. used for $SII index in FILE_Secure, which sorts by
+ *      security_id (le32).
+ * COLLATION_NTOFS_SID - Sorting is done according to ascending SID values.
+ *      E.g. used for $O index in FILE_Extend/$Quota.
+ * COLLATION_NTOFS_SECURITY_HASH - Sorting is done first by ascending hash
+ *      values and second by ascending security_id values. E.g. used for $SDH
+ *      index in FILE_Secure.
+ * COLLATION_NTOFS_ULONGS - Sorting is done according to a sequence of ascending
+ *      le32 key values. E.g. used for $O index in FILE_Extend/$ObjId, which
+ *      sorts by object_id (16-byte), by splitting up the object_id in four
+ *      le32 values and using them as individual keys. E.g. take the following
+ *      two object_ids, stored as follows on disk:
+ *              1st: a1 61 65 b7 65 7b d4 11 9e 3d 00 e0 81 10 42 59
+ *              2nd: 38 14 37 d2 d2 f3 d4 11 a5 21 c8 6b 79 b1 97 45
+ *      To compare them, they are split into four le32 values each, like so:
+ *              1st: 0xb76561a1 0x11d47b65 0xe0003d9e 0x59421081
+ *              2nd: 0xd2371438 0x11d4f3d2 0x6bc821a5 0x4597b179
+ *      Now, it is apparent why the 2nd object_id collates after the 1st: the
+ *      first le32 value of the 1st object_id is less than the first le32 of
+ *      the 2nd object_id. If the first le32 values of both object_ids were
+ *      equal then the second le32 values would be compared, etc.
+ */
+enum {
+        COLLATION_BINARY                = const_cpu_to_le32(0x00),
+        COLLATION_FILE_NAME             = const_cpu_to_le32(0x01),
+        COLLATION_UNICODE_STRING        = const_cpu_to_le32(0x02),
+        COLLATION_NTOFS_ULONG           = const_cpu_to_le32(0x10),
+        COLLATION_NTOFS_SID             = const_cpu_to_le32(0x11),
+        COLLATION_NTOFS_SECURITY_HASH   = const_cpu_to_le32(0x12),
+        COLLATION_NTOFS_ULONGS          = const_cpu_to_le32(0x13)
+};
+typedef le32 COLLATION_RULE;
+/*
+ * The flags (32-bit) describing attribute properties in the attribute
+ * definition structure.  FIXME: This information is from Regis's information
+ * and, according to him, it is not certain and probably incomplete.
+ * The INDEXABLE flag is fairly certainly correct as only the file name
+ * attribute has this flag set and this is the only attribute indexed in NT4.
+ */
+enum {
+        INDEXABLE           = const_cpu_to_le32(0x02), /* Attribute can be
+                                                          indexed. */
+        NEED_TO_REGENERATE  = const_cpu_to_le32(0x40), /* Need to regenerate
+                                                          during regeneration
+                                                          phase. */
+        CAN_BE_NON_RESIDENT = const_cpu_to_le32(0x80), /* Attribute can be
+                                                          non-resident. */
+};
+typedef le32 ATTR_DEF_FLAGS;
+/*
+ * The data attribute of FILE_AttrDef contains a sequence of attribute
+ * definitions for the NTFS volume. With this, it is supposed to be safe for an
+ * older NTFS driver to mount a volume containing a newer NTFS version without
+ * damaging it (that's the theory. In practice it's: not damaging it too much).
+ * Entries are sorted by attribute type. The flags describe whether the
+ * attribute can be resident/non-resident and possibly other things, but the
+ * actual bits are unknown.
+ */
+typedef struct {
+/*hex ofs (all offsets below are hexadecimal)*/
+/*  0*/ ntfschar name[0x40];            /* Unicode name of the attribute, zero
+                                           terminated.  Occupies 0x80 bytes. */
+/* 80*/ ATTR_TYPE type;                 /* Type of attribute being defined. */
+/* 84*/ le32 display_rule;              /* Default display rule.
+                                           FIXME: What does it mean? (AIA) */
+/* 88*/ COLLATION_RULE collation_rule;  /* Default collation rule. */
+/* 8c*/ ATTR_DEF_FLAGS flags;           /* Flags describing the attribute. */
+/* 90*/ sle64 min_size;                 /* Optional minimum attribute size. */
+/* 98*/ sle64 max_size;                 /* Maximum size of attribute. */
+/* sizeof() = 0xa0 (160 decimal) bytes */
+} __attribute__ ((__packed__)) ATTR_DEF;
+/*
+ * Attribute flags (16-bit).
+ */
+enum {  /* All values are stored little endian (const_cpu_to_le16). */
+        ATTR_IS_COMPRESSED    = const_cpu_to_le16(0x0001),
+        ATTR_COMPRESSION_MASK = const_cpu_to_le16(0x00ff), /* Compression method
+                                                              mask.  Also, first
+                                                              illegal value. */
+        ATTR_IS_ENCRYPTED     = const_cpu_to_le16(0x4000),
+        ATTR_IS_SPARSE        = const_cpu_to_le16(0x8000),
+} __attribute__ ((__packed__));
+typedef le16 ATTR_FLAGS;        /* Used by ATTR_RECORD.flags. */
+/*
+ * Attribute compression.
+ *
+ * Only the data attribute is ever compressed in the current ntfs driver in
+ * Windows. Further, compression is only applied when the data attribute is
+ * non-resident. Finally, to use compression, the maximum allowed cluster size
+ * on a volume is 4kib.
+ *
+ * The compression method is based on independently compressing blocks of X
+ * clusters, where X is determined from the compression_unit value found in the
+ * non-resident attribute record header (more precisely: X = 2^compression_unit
+ * clusters). On Windows NT/2k, X always is 16 clusters (compression_unit = 4).
+ *
+ * There are three different cases of how a compression block of X clusters
+ * can be stored:
+ *
+ *   1) The data in the block is all zero (a sparse block):
+ *        This is stored as a sparse block in the runlist, i.e. the runlist
+ *        entry has length = X and lcn = -1. The mapping pairs array actually
+ *        uses a delta_lcn value length of 0, i.e. delta_lcn is not present at
+ *        all, which is then interpreted by the driver as lcn = -1.
+ *        NOTE: Even uncompressed files can be sparse on NTFS 3.0 volumes, then
+ *        the same principles apply as above, except that the length is not
+ *        restricted to being any particular value.
+ *
+ *   2) The data in the block is not compressed:
+ *        This happens when compression doesn't reduce the size of the block
+ *        in clusters. I.e. if compression has a small effect so that the
+ *        compressed data still occupies X clusters, then the uncompressed data
+ *        is stored in the block.
+ *        This case is recognised by the fact that the runlist entry has
+ *        length = X and lcn >= 0. The mapping pairs array stores this as
+ *        normal with a run length of X and some specific delta_lcn, i.e.
+ *        delta_lcn has to be present.
+ *
+ *   3) The data in the block is compressed:
+ *        The common case. This case is recognised by the fact that the run
+ *        list entry has length L < X and lcn >= 0. The mapping pairs array
+ *        stores this as normal with a run length of L and some specific
+ *        delta_lcn, i.e. delta_lcn has to be present. This runlist entry is
+ *        immediately followed by a sparse entry with length = X - L and
+ *        lcn = -1. The latter entry is to make up the vcn counting to the
+ *        full compression block size X.
+ *
+ * In fact, life is more complicated because adjacent entries of the same type
+ * can be coalesced. This means that one has to keep track of the number of
+ * clusters handled and work on a basis of X clusters at a time being one
+ * block. An example: if length L > X this means that this particular runlist
+ * entry contains a block of length X and part of one or more blocks of length
+ * L - X. Another example: if length L < X, this does not necessarily mean that
+ * the block is compressed as it might be that the lcn changes inside the block
+ * and hence the following runlist entry describes the continuation of the
+ * potentially compressed block. The block would be compressed if the
+ * following runlist entry describes at least X - L sparse clusters, thus
+ * making up the compression block length as described in point 3 above. (Of
+ * course, there can be several runlist entries with small lengths so that the
+ * sparse entry does not follow the first data containing entry with
+ * length < X.)
+ *
+ * NOTE: At the end of the compressed attribute value, there most likely is not
+ * just the right amount of data to make up a compression block, thus this data
+ * is not even attempted to be compressed. It is just stored as is, unless
+ * the number of clusters it occupies is reduced when compressed in which case
+ * it is stored as a compressed compression block, complete with sparse
+ * clusters at the end.
+ */
+/*
+ * Flags of resident attributes (8-bit).
+ */
+enum {
+        RESIDENT_ATTR_IS_INDEXED = 0x01, /* Attribute is referenced in an index
+                                            (has implications for deleting and
+                                            modifying the attribute). */
+} __attribute__ ((__packed__));
+typedef u8 RESIDENT_ATTR_FLAGS; /* Used by ATTR_RECORD.data.resident.flags. */
+/*
+ * Attribute record header. Always aligned to 8-byte boundary.
+ */
+typedef struct {
+/*Ofs (decimal)*/
+/*  0*/ ATTR_TYPE type;         /* The (32-bit) type of the attribute. */
+/*  4*/ le32 length;            /* Byte size of the resident part of the
+                                   attribute (aligned to 8-byte boundary).
+                                   Used to get to the next attribute. */
+/*  8*/ u8 non_resident;        /* If 0, attribute is resident.
+                                   If 1, attribute is non-resident. */
+/*  9*/ u8 name_length;         /* Size of the attribute name in Unicode
+                                   characters. 0 if unnamed. */
+/* 10*/ le16 name_offset;       /* If name_length != 0, the byte offset to the
+                                   beginning of the name from the attribute
+                                   record. Note that the name is stored as a
+                                   Unicode string. When creating, place offset
+                                   just at the end of the record header. Then,
+                                   follow with attribute value or mapping pairs
+                                   array, resident and non-resident attributes
+                                   respectively, aligning to an 8-byte
+                                   boundary. */
+/* 12*/ ATTR_FLAGS flags;       /* Flags describing the attribute. */
+/* 14*/ le16 instance;          /* The instance of this attribute record. This
+                                   number is unique within this mft record (see
+                                   MFT_RECORD/next_attribute_instance notes in
+                                   mft.h for more details). */
+/* 16*/ union {
+                /* Resident attributes. */
+                struct {
+/* 16 */                le32 value_length;/* Byte size of attribute value. */
+/* 20 */                le16 value_offset;/* Byte offset of the attribute
+                                             value from the start of the
+                                             attribute record. When creating,
+                                             align to 8-byte boundary if we
+                                             have a name present as this might
+                                             not have a length of a multiple
+                                             of 8-bytes. */
+/* 22 */                RESIDENT_ATTR_FLAGS flags; /* See above. */
+/* 23 */                s8 reserved;      /* Reserved/alignment to 8-byte
+                                             boundary. */
+                } __attribute__ ((__packed__)) resident;
+                /* Non-resident attributes. */
+                struct {
+/* 16*/                 leVCN lowest_vcn;/* Lowest valid virtual cluster number
+                                for this portion of the attribute value or
+                                0 if this is the only extent (usually the
+                                case). - Only when an attribute list is used
+                                does lowest_vcn != 0 ever occur. */
+/* 24*/                 leVCN highest_vcn;/* Highest valid vcn of this extent of
+                                the attribute value. - Usually there is only one
+                                portion, so this usually equals the attribute
+                                value size in clusters minus 1. Can be -1 for
+                                zero length files. Can be 0 for "single extent"
+                                attributes. */
+/* 32*/                 le16 mapping_pairs_offset; /* Byte offset from the
+                                beginning of the structure to the mapping pairs
+                                array which contains the mappings between the
+                                vcns and the logical cluster numbers (lcns).
+                                When creating, place this at the end of this
+                                record header aligned to 8-byte boundary. */
+/* 34*/                 u8 compression_unit; /* The compression unit expressed
+                                as the log to the base 2 of the number of
+                                clusters in a compression unit. 0 means not
+                                compressed. (This effectively limits the
+                                compression unit size to be a power of two
+                                clusters.) WinNT4 only uses a value of 4. */
+/* 35*/                 u8 reserved[5];         /* Align to 8-byte boundary. */
+/* The sizes below are only used when lowest_vcn is zero, as otherwise it would
+   be difficult to keep them up-to-date.*/
+/* 40*/                 sle64 allocated_size;   /* Byte size of disk space
+                                allocated to hold the attribute value. Always
+                                is a multiple of the cluster size. When a file
+                                is compressed, this field is a multiple of the
+                                compression block size (2^compression_unit) and
+                                it represents the logically allocated space
+                                rather than the actual on disk usage. For this
+                                use the compressed_size (see below). */
+/* 48*/                 sle64 data_size;        /* Byte size of the attribute
+                                value. Can be larger than allocated_size if
+                                attribute value is compressed or sparse. */
+/* 56*/                 sle64 initialized_size; /* Byte size of initialized
+                                portion of the attribute value. Usually equals
+                                data_size. */
+/* sizeof(uncompressed attr) = 64*/
+/* 64*/                 sle64 compressed_size;  /* Byte size of the attribute
+                                value after compression. Only present when
+                                compressed. Always is a multiple of the
+                                cluster size. Represents the actual amount of
+                                disk space being used on the disk. */
+/* sizeof(compressed attr) = 72*/
+                } __attribute__ ((__packed__)) non_resident;
+        } __attribute__ ((__packed__)) data;
+} __attribute__ ((__packed__)) ATTR_RECORD;
+typedef ATTR_RECORD ATTR_REC;   /* Short alias for ATTR_RECORD. */
+/*
+ * File attribute flags (32-bit).
+ */
+enum {
+        /*
+         * The following flags are only present in the STANDARD_INFORMATION
+         * attribute (in the field file_attributes).
+         */
+        FILE_ATTR_READONLY              = const_cpu_to_le32(0x00000001),
+        FILE_ATTR_HIDDEN                = const_cpu_to_le32(0x00000002),
+        FILE_ATTR_SYSTEM                = const_cpu_to_le32(0x00000004),
+        /* Old DOS volid. Unused in NT. = const_cpu_to_le32(0x00000008), */
+        FILE_ATTR_DIRECTORY             = const_cpu_to_le32(0x00000010),
+        /* Note, FILE_ATTR_DIRECTORY is not considered valid in NT.  It is
+           reserved for the DOS SUBDIRECTORY flag. */
+        FILE_ATTR_ARCHIVE               = const_cpu_to_le32(0x00000020),
+        FILE_ATTR_DEVICE                = const_cpu_to_le32(0x00000040),
+        FILE_ATTR_NORMAL                = const_cpu_to_le32(0x00000080),
+        FILE_ATTR_TEMPORARY             = const_cpu_to_le32(0x00000100),
+        FILE_ATTR_SPARSE_FILE           = const_cpu_to_le32(0x00000200),
+        FILE_ATTR_REPARSE_POINT         = const_cpu_to_le32(0x00000400),
+        FILE_ATTR_COMPRESSED            = const_cpu_to_le32(0x00000800),
+        FILE_ATTR_OFFLINE               = const_cpu_to_le32(0x00001000),
+        FILE_ATTR_NOT_CONTENT_INDEXED   = const_cpu_to_le32(0x00002000),
+        FILE_ATTR_ENCRYPTED             = const_cpu_to_le32(0x00004000),
+        FILE_ATTR_VALID_FLAGS           = const_cpu_to_le32(0x00007fb7),
+        /* Note, FILE_ATTR_VALID_FLAGS masks out the old DOS VolId and the
+           FILE_ATTR_DEVICE and preserves everything else.  This mask is used
+           to obtain all flags that are valid for reading. */
+        FILE_ATTR_VALID_SET_FLAGS       = const_cpu_to_le32(0x000031a7),
+        /* Note, FILE_ATTR_VALID_SET_FLAGS masks out the old DOS VolId, the
+           F_A_DEVICE, F_A_DIRECTORY, F_A_SPARSE_FILE, F_A_REPARSE_POINT,
+           F_A_COMPRESSED, and F_A_ENCRYPTED and preserves the rest.  This mask
+           is used to obtain all flags that are valid for setting. */
+        /*
+         * The following flags are only present in the FILE_NAME attribute (in
+         * the field file_attributes).
+         */
+        FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT   = const_cpu_to_le32(0x10000000),
+        /* Note, this is a copy of the corresponding bit from the mft record,
+           telling us whether this is a directory or not, i.e. whether it has
+           an index root attribute or not. */
+        FILE_ATTR_DUP_VIEW_INDEX_PRESENT        = const_cpu_to_le32(0x20000000),
+        /* Note, this is a copy of the corresponding bit from the mft record,
+           telling us whether this file has a view index present (eg. object id
+           index, quota index, one of the security indexes or the encrypting
+           file system related indexes). */
+};
+typedef le32 FILE_ATTR_FLAGS;   /* Type of the file_attributes fields. */
+/*
+ * NOTE on times in NTFS: All times are in MS standard time format, i.e. they
+ * are the number of 100-nanosecond intervals since 1st January 1601, 00:00:00
+ * universal coordinated time (UTC). (In Linux time starts 1st January 1970,
+ * 00:00:00 UTC and is stored as the number of 1-second intervals since then.)
+ */
+/*
+ * Attribute: Standard information (0x10).
+ *
+ * NOTE: Always resident.
+ * NOTE: Present in all base file records on a volume.
+ * NOTE: There is conflicting information about the meaning of each of the time
+ *       fields but the meaning as defined below has been verified to be
+ *       correct by practical experimentation on Windows NT4 SP6a and is hence
+ *       assumed to be the one and only correct interpretation.
+ */
+typedef struct {
+/*Ofs (decimal)*/
+/*  0*/ sle64 creation_time;            /* Time file was created. Updated when
+                                           a filename is changed(?). */
+/*  8*/ sle64 last_data_change_time;    /* Time the data attribute was last
+                                           modified. */
+/* 16*/ sle64 last_mft_change_time;     /* Time this mft record was last
+                                           modified. */
+/* 24*/ sle64 last_access_time;         /* Approximate time when the file was
+                                           last accessed (obviously this is not
+                                           updated on read-only volumes). In
+                                           Windows this is only updated when
+                                           accessed if some time delta has
+                                           passed since the last update. Also,
+                                           last access time updates can be
+                                           disabled altogether for speed. */
+/* 32*/ FILE_ATTR_FLAGS file_attributes; /* Flags describing the file. */
+/* 36*/ union {
+        /* NTFS 1.2 */
+                struct {
+                /* 36*/ u8 reserved12[12];      /* Reserved/alignment to 8-byte
+                                                   boundary. */
+                } __attribute__ ((__packed__)) v1;
+        /* sizeof() = 48 bytes */
+        /* NTFS 3.x */
+                struct {
+/*
+ * If a volume has been upgraded from a previous NTFS version, then these
+ * fields are present only if the file has been accessed since the upgrade.
+ * Recognize the difference by comparing the length of the resident attribute
+ * value. If it is 48, then the following fields are missing. If it is 72 then
+ * the fields are present. Maybe just check like this:
+ *      if (resident.value_length < sizeof(STANDARD_INFORMATION)) {
+ *              Assume NTFS 1.2- format.
+ *              If (volume version is 3.x)
+ *                      Upgrade attribute to NTFS 3.x format.
+ *              else
+ *                      Use NTFS 1.2- format for access.
+ *      } else
+ *              Use NTFS 3.x format for access.
+ * Only problem is that it might be legal to set the length of the value to
+ * arbitrarily large values thus spoiling this check. - But chkdsk probably
+ * views that as a corruption, assuming that it behaves like this for all
+ * attributes.
+ */
+                /* 36*/ le32 maximum_versions;  /* Maximum allowed versions for
+                                file. Zero if version numbering is disabled. */
+                /* 40*/ le32 version_number;    /* This file's version (if any).
+                                Set to zero if maximum_versions is zero. */
+                /* 44*/ le32 class_id;          /* Class id from bidirectional
+                                class id index (?). */
+                /* 48*/ le32 owner_id;          /* Owner_id of the user owning
+                                the file. Translate via $Q index in FILE_Extend
+                                /$Quota to the quota control entry for the user
+                                owning the file. Zero if quotas are disabled. */
+                /* 52*/ le32 security_id;       /* Security_id for the file.
+                                Translate via $SII index and $SDS data stream
+                                in FILE_Secure to the security descriptor. */
+                /* 56*/ le64 quota_charged;     /* Byte size of the charge to
+                                the quota for all streams of the file. Note: Is
+                                zero if quotas are disabled. */
+                /* 64*/ le64 usn;               /* Last update sequence number
+                                of the file. This is a direct index into the
+                                change (aka usn) journal file. It is zero if
+                                the usn journal is disabled.
+                                NOTE: To disable the journal need to delete
+                                the journal file itself and to then walk the
+                                whole mft and set all Usn entries in all mft
+                                records to zero! (This can take a while!)
+                                The journal is FILE_Extend/$UsnJrnl. Win2k
+                                will recreate the journal and initiate
+                                logging if necessary when mounting the
+                                partition. This, in contrast to disabling the
+                                journal is a very fast process, so the user
+                                won't even notice it. */
+                } __attribute__ ((__packed__)) v3;
+        /* sizeof() = 72 bytes (NTFS 3.x) */
+        } __attribute__ ((__packed__)) ver;
+} __attribute__ ((__packed__)) STANDARD_INFORMATION;
+/*
+ * Attribute: Attribute list (0x20).
+ *
+ * - Can be either resident or non-resident.
+ * - Value consists of a sequence of variable length, 8-byte aligned,
+ * ATTR_LIST_ENTRY records.
+ * - The list is not terminated by anything at all! The only way to know when
+ * the end is reached is to keep track of the current offset and compare it to
+ * the attribute value size.
+ * - The attribute list attribute contains one entry for each attribute of
+ * the file in which the list is located, except for the list attribute
+ * itself. The list is sorted: first by attribute type, second by attribute
+ * name (if present), third by instance number. The extents of one
+ * non-resident attribute (if present) immediately follow after the initial
+ * extent. They are ordered by lowest_vcn and have their instance set to zero.
+ * It is not allowed to have two attributes with all sorting keys equal.
+ * - Further restrictions:
+ *      - If not resident, the vcn to lcn mapping array has to fit inside the
+ *        base mft record.
+ *      - The attribute list attribute value has a maximum size of 256kb. This
+ *        is imposed by the Windows cache manager.
+ * - Attribute lists are only used when the attributes of an mft record do not
+ * fit inside the mft record despite all attributes (that can be made
+ * non-resident) having been made non-resident. This can happen e.g. when:
+ *      - File has a large number of hard links (lots of file name
+ *        attributes present).
+ *      - The mapping pairs array of some non-resident attribute becomes so
+ *        large due to fragmentation that it overflows the mft record.
+ *      - The security descriptor is very complex (not applicable to
+ *        NTFS 3.0 volumes).
+ *      - There are many named streams.
+ */
+typedef struct {
+/*Ofs (decimal)*/
+/*  0*/ ATTR_TYPE type;         /* Type of referenced attribute. */
+/*  4*/ le16 length;            /* Byte size of this entry (8-byte aligned). */
+/*  6*/ u8 name_length;         /* Size in Unicode chars of the name of the
+                                   attribute or 0 if unnamed. */
+/*  7*/ u8 name_offset;         /* Byte offset to beginning of attribute name
+                                   (always set this to where the name would
+                                   start even if unnamed). */
+/*  8*/ leVCN lowest_vcn;       /* Lowest virtual cluster number of this portion
+                                   of the attribute value. This is usually 0. It
+                                   is non-zero for the case where one attribute
+                                   does not fit into one mft record and thus
+                                   several mft records are allocated to hold
+                                   this attribute. In the latter case, each mft
+                                   record holds one extent of the attribute and
+                                   there is one attribute list entry for each
+                                   extent. NOTE: This is DEFINITELY a signed
+                                   value! The windows driver uses cmp, followed
+                                   by jg when comparing this, thus it treats it
+                                   as signed. */
+/* 16*/ leMFT_REF mft_reference;/* The reference of the mft record holding
+                                   the ATTR_RECORD for this portion of the
+                                   attribute value. */
+/* 24*/ le16 instance;          /* If lowest_vcn = 0, the instance of the
+                                   attribute being referenced; otherwise 0. */
+/* 26*/ ntfschar name[0];       /* Use when creating only. When reading use
+                                   name_offset to determine the location of the
+                                   name. */
+/* Entry size = 26 + (name_length * 2) bytes, before 8-byte alignment */
+} __attribute__ ((__packed__)) ATTR_LIST_ENTRY;
+/*
+ * The maximum allowed file name length (cf. FILE_NAME_ATTR.file_name_length).
+ */
+#define MAXIMUM_FILE_NAME_LENGTH        255
+/*
+ * Possible namespaces for filenames in ntfs (8-bit).
+ */
+enum {
+        FILE_NAME_POSIX         = 0x00,
+        /* This is the largest namespace. It is case sensitive and allows all
+           Unicode characters except for: '\0' and '/'.  Beware that in
+           WinNT/2k files which eg have the same name except for their case
+           will not be distinguished by the standard utilities and thus a "del
+           filename" will delete both "filename" and "fileName" without
+           warning. */
+        FILE_NAME_WIN32         = 0x01,
+        /* The standard WinNT/2k NTFS long filenames. Case insensitive.  All
+           Unicode chars except: '\0', '"', '*', '/', ':', '<', '>', '?', '\',
+           and '|'.  Further, names cannot end with a '.' or a space. */
+        FILE_NAME_DOS           = 0x02,
+        /* The standard DOS filenames (8.3 format). Uppercase only.  All 8-bit
+           characters greater than space, except: '"', '*', '+', ',', '/', ':',
+           ';', '<', '=', '>', '?', and '\'. */
+        FILE_NAME_WIN32_AND_DOS = 0x03,
+        /* 3 means that both the Win32 and the DOS filenames are identical and
+           hence have been saved in this single filename record. */
+} __attribute__ ((__packed__));
+typedef u8 FILE_NAME_TYPE_FLAGS; /* Used by FILE_NAME_ATTR.file_name_type. */
+/*
+ * Attribute: Filename (0x30).
+ *
+ * NOTE: Always resident.
+ * NOTE: All fields, except the parent_directory, are only updated when the
+ *       filename is changed. Until then, they just become out of sync with
+ *       reality and the more up to date values are present in the standard
+ *       information attribute.
+ * NOTE: There is conflicting information about the meaning of each of the time
+ *       fields but the meaning as defined below has been verified to be
+ *       correct by practical experimentation on Windows NT4 SP6a and is hence
+ *       assumed to be the one and only correct interpretation.
+ */
+typedef struct {
+/*hex ofs (all offsets below are hexadecimal)*/
+/*  0*/ leMFT_REF parent_directory;     /* Directory this filename is
+                                           referenced from. */
+/*  8*/ sle64 creation_time;            /* Time file was created. */
+/* 10*/ sle64 last_data_change_time;    /* Time the data attribute was last
+                                           modified. */
+/* 18*/ sle64 last_mft_change_time;     /* Time this mft record was last
+                                           modified. */
+/* 20*/ sle64 last_access_time;         /* Time this mft record was last
+                                           accessed. */
+/* 28*/ sle64 allocated_size;           /* Byte size of allocated space for the
+                                           data attribute. NOTE: Is a multiple
+                                           of the cluster size. */
+/* 30*/ sle64 data_size;                /* Byte size of actual data in data
+                                           attribute. */
+/* 38*/ FILE_ATTR_FLAGS file_attributes;        /* Flags describing the file. */
+/* 3c*/ union {
+        /* 3c*/ struct {
+                /* 3c*/ le16 packed_ea_size;    /* Size of the buffer needed to
+                                                   pack the extended attributes
+                                                   (EAs), if such are present.*/
+                /* 3e*/ le16 reserved;          /* Reserved for alignment. */
+                } __attribute__ ((__packed__)) ea;
+        /* 3c*/ struct {
+                /* 3c*/ le32 reparse_point_tag; /* Type of reparse point,
+                                                   present only in reparse
+                                                   points and only if there are
+                                                   no EAs. */
+                } __attribute__ ((__packed__)) rp;
+        } __attribute__ ((__packed__)) type;
+/* 40*/ u8 file_name_length;                    /* Length of file name in
+                                                   (Unicode) characters. */
+/* 41*/ FILE_NAME_TYPE_FLAGS file_name_type;    /* Namespace of the file name.*/
+/* 42*/ ntfschar file_name[0];                  /* File name in Unicode. */
+} __attribute__ ((__packed__)) FILE_NAME_ATTR;
+/*
+ * GUID structures store globally unique identifiers (GUID). A GUID is a
+ * 128-bit value consisting of one group of eight hexadecimal digits, followed
+ * by three groups of four hexadecimal digits each, followed by one group of
+ * twelve hexadecimal digits. GUIDs are Microsoft's implementation of the
+ * distributed computing environment (DCE) universally unique identifier (UUID).
+ * Example of a GUID:
+ *      6B29FC40-CA47-1067-B31D-00DD010662DA
+ */
+typedef struct {
+        le32 data1;     /* The first eight hexadecimal digits of the GUID. */
+        le16 data2;     /* The first group of four hexadecimal digits. */
+        le16 data3;     /* The second group of four hexadecimal digits. */
+        u8 data4[8];    /* The first two bytes are the third group of four
+                           hexadecimal digits. The remaining six bytes are the
+                           final 12 hexadecimal digits. */
+} __attribute__ ((__packed__)) GUID;
+/*
+ * FILE_Extend/$ObjId contains an index named $O. This index contains all
+ * object_ids present on the volume as the index keys and the corresponding
+ * mft_record numbers as the index entry data parts. The data part (defined
+ * below) also contains three other object_ids:
+ *      birth_volume_id - object_id of FILE_Volume on which the file was first
+ *                        created. Optional (i.e. can be zero).
+ *      birth_object_id - object_id of file when it was first created. Usually
+ *                        equals the object_id. Optional (i.e. can be zero).
+ *      domain_id       - Reserved (always zero).
+ */
+typedef struct {
+        leMFT_REF mft_reference;/* Mft record containing the object_id in
+                                   the index entry key. */
+        union {
+                struct {
+                        GUID birth_volume_id;   /* Optional, can be zero. */
+                        GUID birth_object_id;   /* Optional, can be zero. */
+                        GUID domain_id;         /* Reserved, always zero. */
+                } __attribute__ ((__packed__)) origin;
+                u8 extended_info[48];   /* The same 48 bytes, as raw bytes. */
+        } __attribute__ ((__packed__)) opt;
+} __attribute__ ((__packed__)) OBJ_ID_INDEX_DATA;
+/*
+ * Attribute: Object id (NTFS 3.0+) (0x40).
+ *
+ * NOTE: Always resident.
+ */
+typedef struct {
+        GUID object_id;                         /* Unique id assigned to the
+                                                   file.*/
+        /* The following fields are optional. The attribute value size is 16
+           bytes, i.e. sizeof(GUID), if these are not present at all. Note,
+           the entries can be present but one or more (or all) can be zero
+           meaning that the particular value(s) is(are) not defined. */
+        union {
+                struct {
+                        GUID birth_volume_id;   /* Unique id of volume on which
+                                                   the file was first created.*/
+                        GUID birth_object_id;   /* Unique id of file when it was
+                                                   first created. */
+                        GUID domain_id;         /* Reserved, zero. */
+                } __attribute__ ((__packed__)) origin;
+                u8 extended_info[48];   /* The same 48 bytes, as raw bytes. */
+        } __attribute__ ((__packed__)) opt;
+} __attribute__ ((__packed__)) OBJECT_ID_ATTR;
+/*
+ * The pre-defined IDENTIFIER_AUTHORITIES used as SID_IDENTIFIER_AUTHORITY in
+ * the SID structure (see below).
+ */
+//typedef enum {                                        /* SID string prefix. */
+//      SECURITY_NULL_SID_AUTHORITY     = {0, 0, 0, 0, 0, 0},   /* S-1-0 */
+//      SECURITY_WORLD_SID_AUTHORITY    = {0, 0, 0, 0, 0, 1},   /* S-1-1 */
+//      SECURITY_LOCAL_SID_AUTHORITY    = {0, 0, 0, 0, 0, 2},   /* S-1-2 */
+//      SECURITY_CREATOR_SID_AUTHORITY  = {0, 0, 0, 0, 0, 3},   /* S-1-3 */
+//      SECURITY_NON_UNIQUE_AUTHORITY   = {0, 0, 0, 0, 0, 4},   /* S-1-4 */
+//      SECURITY_NT_SID_AUTHORITY       = {0, 0, 0, 0, 0, 5},   /* S-1-5 */
+//} IDENTIFIER_AUTHORITIES;
+/*
+ * These relative identifiers (RIDs) are used with the above identifier
+ * authorities to make up universal well-known SIDs.
+ *
+ * Note: The relative identifier (RID) refers to the portion of a SID, which
+ * identifies a user or group in relation to the authority that issued the SID.
+ * For example, the universal well-known SID Creator Owner ID (S-1-3-0) is
+ * made up of the identifier authority SECURITY_CREATOR_SID_AUTHORITY (3) and
+ * the relative identifier SECURITY_CREATOR_OWNER_RID (0).
+ */
+typedef enum {                                  /* Identifier authority. */
+        SECURITY_NULL_RID                 = 0,  /* S-1-0 */
+        SECURITY_WORLD_RID                = 0,  /* S-1-1 */
+        SECURITY_LOCAL_RID                = 0,  /* S-1-2 */
+        SECURITY_CREATOR_OWNER_RID        = 0,  /* S-1-3 */
+        SECURITY_CREATOR_GROUP_RID        = 1,  /* S-1-3 */
+        SECURITY_CREATOR_OWNER_SERVER_RID = 2,  /* S-1-3 */
+        SECURITY_CREATOR_GROUP_SERVER_RID = 3,  /* S-1-3 */
+        SECURITY_DIALUP_RID               = 1,  /* S-1-5 */
+        SECURITY_NETWORK_RID              = 2,  /* S-1-5 */
+        SECURITY_BATCH_RID                = 3,  /* S-1-5 */
+        SECURITY_INTERACTIVE_RID          = 4,  /* S-1-5 */
+        SECURITY_SERVICE_RID              = 6,  /* S-1-5 */
+        SECURITY_ANONYMOUS_LOGON_RID      = 7,  /* S-1-5 */
+        SECURITY_PROXY_RID                = 8,  /* S-1-5 */
+        SECURITY_ENTERPRISE_CONTROLLERS_RID=9,  /* S-1-5 */
+        SECURITY_SERVER_LOGON_RID         = 9,  /* S-1-5 */
+        SECURITY_PRINCIPAL_SELF_RID       = 0xa,        /* S-1-5 */
+        SECURITY_AUTHENTICATED_USER_RID   = 0xb,        /* S-1-5 */
+        SECURITY_RESTRICTED_CODE_RID      = 0xc,        /* S-1-5 */
+        SECURITY_TERMINAL_SERVER_RID      = 0xd,        /* S-1-5 */
+        SECURITY_LOGON_IDS_RID            = 5,  /* S-1-5 */
+        SECURITY_LOGON_IDS_RID_COUNT      = 3,  /* Presumably a count, not a
+                                                   RID - TODO confirm. */
+        SECURITY_NT_NON_UNIQUE            = 0x15,       /* S-1-5 */
+        SECURITY_BUILTIN_DOMAIN_RID       = 0x20,       /* S-1-5 */
+        /*
+         * Well-known domain relative sub-authority values (RIDs).
+         */
+        /* Users. */
+        DOMAIN_USER_RID_ADMIN             = 0x1f4,
+        DOMAIN_USER_RID_GUEST             = 0x1f5,
+        DOMAIN_USER_RID_KRBTGT            = 0x1f6,
+        /* Groups. */
+        DOMAIN_GROUP_RID_ADMINS           = 0x200,
+        DOMAIN_GROUP_RID_USERS            = 0x201,
+        DOMAIN_GROUP_RID_GUESTS           = 0x202,
+        DOMAIN_GROUP_RID_COMPUTERS        = 0x203,
+        DOMAIN_GROUP_RID_CONTROLLERS      = 0x204,
+        DOMAIN_GROUP_RID_CERT_ADMINS      = 0x205,
+        DOMAIN_GROUP_RID_SCHEMA_ADMINS    = 0x206,
+        DOMAIN_GROUP_RID_ENTERPRISE_ADMINS= 0x207,
+        DOMAIN_GROUP_RID_POLICY_ADMINS    = 0x208,
+        /* Aliases. */
+        DOMAIN_ALIAS_RID_ADMINS           = 0x220,
+        DOMAIN_ALIAS_RID_USERS            = 0x221,
+        DOMAIN_ALIAS_RID_GUESTS           = 0x222,
+        DOMAIN_ALIAS_RID_POWER_USERS      = 0x223,
+        DOMAIN_ALIAS_RID_ACCOUNT_OPS      = 0x224,
+        DOMAIN_ALIAS_RID_SYSTEM_OPS       = 0x225,
+        DOMAIN_ALIAS_RID_PRINT_OPS        = 0x226,
+        DOMAIN_ALIAS_RID_BACKUP_OPS       = 0x227,
+        DOMAIN_ALIAS_RID_REPLICATOR       = 0x228,
+        DOMAIN_ALIAS_RID_RAS_SERVERS      = 0x229,
+        DOMAIN_ALIAS_RID_PREW2KCOMPACCESS = 0x22a,
+} RELATIVE_IDENTIFIERS;
+/*
+ * The universal well-known SIDs:
+ *
+ *      NULL_SID                        S-1-0-0
+ *      WORLD_SID                       S-1-1-0
+ *      LOCAL_SID                       S-1-2-0
+ *      CREATOR_OWNER_SID               S-1-3-0
+ *      CREATOR_GROUP_SID               S-1-3-1
+ *      CREATOR_OWNER_SERVER_SID        S-1-3-2
+ *      CREATOR_GROUP_SERVER_SID        S-1-3-3
+ *
+ *      (Non-unique IDs)                S-1-4
+ *
+ * NT well-known SIDs:
+ *
+ *      NT_AUTHORITY_SID        S-1-5
+ *      DIALUP_SID              S-1-5-1
+ *
+ *      NETWORK_SID             S-1-5-2
+ *      BATCH_SID               S-1-5-3
+ *      INTERACTIVE_SID         S-1-5-4
+ *      SERVICE_SID             S-1-5-6
+ *      ANONYMOUS_LOGON_SID     S-1-5-7         (aka null logon session)
+ *      PROXY_SID               S-1-5-8
+ *      SERVER_LOGON_SID        S-1-5-9         (aka domain controller account)
+ *      SELF_SID                S-1-5-10        (self RID)
+ *      AUTHENTICATED_USER_SID  S-1-5-11
+ *      RESTRICTED_CODE_SID     S-1-5-12        (running restricted code)
+ *      TERMINAL_SERVER_SID     S-1-5-13        (running on terminal server)
+ *
+ *      (Logon IDs)             S-1-5-5-X-Y
+ *
+ *      (NT non-unique IDs)     S-1-5-0x15-...
+ *
+ *      (Built-in domain)       S-1-5-0x20
+ */
+/*
+ * The SID_IDENTIFIER_AUTHORITY is a 48-bit value used in the SID structure.
+ *
+ * NOTE: This is stored as a big endian number, hence the high_part comes
+ * before the low_part.
+ */
+typedef union {
+        struct {
+                u16 high_part;  /* High 16-bits (stored big endian). */
+                u32 low_part;   /* Low 32-bits (stored big endian). */
+        } __attribute__ ((__packed__)) parts;
+        u8 value[6];            /* Value as individual bytes. */
+} __attribute__ ((__packed__)) SID_IDENTIFIER_AUTHORITY;
+/*
+ * The SID structure is a variable-length structure used to uniquely identify
+ * users or groups. SID stands for security identifier.
+ *
+ * The standard textual representation of the SID is of the form:
+ *      S-R-I-S-S...
+ * Where:
+ *    - The first "S" is the literal character 'S' identifying the following
+ *      digits as a SID.
+ *    - R is the revision level of the SID expressed as a sequence of digits
+ *      either in decimal or hexadecimal (if the latter, prefixed by "0x").
+ *    - I is the 48-bit identifier_authority, expressed as digits as R above.
+ *    - S... is one or more sub_authority values, expressed as digits as above.
+ *
+ * Example SID; the domain-relative SID of the local Administrators group on
+ * Windows NT/2k:
+ *      S-1-5-32-544
+ * This translates to a SID with:
+ *      revision = 1,
+ *      sub_authority_count = 2,
+ *      identifier_authority = {0,0,0,0,0,5},   // SECURITY_NT_AUTHORITY
+ *      sub_authority[0] = 32,                  // SECURITY_BUILTIN_DOMAIN_RID
+ *      sub_authority[1] = 544                  // DOMAIN_ALIAS_RID_ADMINS
+ */
+typedef struct {
+        u8 revision;                    /* Revision level, cf. SID_REVISION. */
+        u8 sub_authority_count;         /* Number of sub_authority values. */
+        SID_IDENTIFIER_AUTHORITY identifier_authority; /* 48-bit authority. */
+        le32 sub_authority[1];          /* At least one sub_authority. */
+} __attribute__ ((__packed__)) SID;
+/*
+ * Current constants for SIDs.
+ */
+typedef enum {
+        SID_REVISION                    =  1,   /* Current revision level. */
+        SID_MAX_SUB_AUTHORITIES         = 15,   /* Max. sub_authority count. */
+        SID_RECOMMENDED_SUB_AUTHORITIES =  1,   /* Will change to around 6 in
+                                                   a future revision. */
+} SID_CONSTANTS;
+/*
+ * The predefined ACE types (8-bit, see below).
+ */
+enum {
+        ACCESS_MIN_MS_ACE_TYPE          = 0, /* Lowest defined type value. */
+        ACCESS_ALLOWED_ACE_TYPE         = 0,
+        ACCESS_DENIED_ACE_TYPE          = 1,
+        SYSTEM_AUDIT_ACE_TYPE           = 2,
+        SYSTEM_ALARM_ACE_TYPE           = 3, /* Not implemented as of Win2k. */
+        ACCESS_MAX_MS_V2_ACE_TYPE       = 3,
+        ACCESS_ALLOWED_COMPOUND_ACE_TYPE= 4,
+        ACCESS_MAX_MS_V3_ACE_TYPE       = 4,
+        /* The following are Win2k only. */
+        ACCESS_MIN_MS_OBJECT_ACE_TYPE   = 5, /* Lowest object ACE type. */
+        ACCESS_ALLOWED_OBJECT_ACE_TYPE  = 5,
+        ACCESS_DENIED_OBJECT_ACE_TYPE   = 6,
+        SYSTEM_AUDIT_OBJECT_ACE_TYPE    = 7,
+        SYSTEM_ALARM_OBJECT_ACE_TYPE    = 8,
+        ACCESS_MAX_MS_OBJECT_ACE_TYPE   = 8, /* Highest object ACE type. */
+        ACCESS_MAX_MS_V4_ACE_TYPE       = 8,
+        /* This one is for WinNT/2k. */
+        ACCESS_MAX_MS_ACE_TYPE          = 8,
+} __attribute__ ((__packed__));
+typedef u8 ACE_TYPES;   /* 8-bit type holding one of the values above. */
+/*
+ * The ACE flags (8-bit) for audit and inheritance (see below).
+ *
+ * SUCCESSFUL_ACCESS_ACE_FLAG is only used with system audit and alarm ACE
+ * types to indicate that a message is generated (in Windows!) for successful
+ * accesses.
+ *
+ * FAILED_ACCESS_ACE_FLAG is only used with system audit and alarm ACE types
+ * to indicate that a message is generated (in Windows!) for failed accesses.
+ */
+enum {
+        /* The inheritance flags. */
+        OBJECT_INHERIT_ACE              = 0x01,
+        CONTAINER_INHERIT_ACE           = 0x02,
+        NO_PROPAGATE_INHERIT_ACE        = 0x04,
+        INHERIT_ONLY_ACE                = 0x08,
+        INHERITED_ACE                   = 0x10, /* Win2k only. */
+        VALID_INHERIT_FLAGS             = 0x1f, /* All five flags above. */
+        /* The audit flags. */
+        SUCCESSFUL_ACCESS_ACE_FLAG      = 0x40,
+        FAILED_ACCESS_ACE_FLAG          = 0x80,
+} __attribute__ ((__packed__));
+typedef u8 ACE_FLAGS;   /* 8-bit bit field of the flags above. */
+/*
+ * An ACE is an access-control entry in an access-control list (ACL).
+ * An ACE defines access to an object for a specific user or group or defines
+ * the types of access that generate system-administration messages or alarms
+ * for a specific user or group. The user or group is identified by a security
+ * identifier (SID).
+ *
+ * Each ACE starts with an ACE_HEADER structure (aligned on 4-byte boundary),
+ * which specifies the type and size of the ACE. The format of the subsequent
+ * data depends on the ACE type.
+ */
+typedef struct {
+/*Ofs*/
+/*  0*/ ACE_TYPES type;         /* Type of the ACE. */
+/*  1*/ ACE_FLAGS flags;        /* Flags describing the ACE. */
+/*  2*/ le16 size;              /* Size in bytes of the ACE. */
+} __attribute__ ((__packed__)) ACE_HEADER;     /* sizeof() = 4 bytes */
+/*
+ * The access mask (32-bit). Defines the access rights.
+ *
+ * The specific rights (bits 0 to 15).  These depend on the type of the object
+ * being secured by the ACE.
+ */
+enum {
+        /* Specific rights for files and directories are as follows: */
+        /* Right to read data from the file. (FILE) */
+        FILE_READ_DATA                  = const_cpu_to_le32(0x00000001),
+        /* Right to list contents of a directory. (DIRECTORY) */
+        FILE_LIST_DIRECTORY             = const_cpu_to_le32(0x00000001),
+        /* Right to write data to the file. (FILE) */
+        FILE_WRITE_DATA                 = const_cpu_to_le32(0x00000002),
+        /* Right to create a file in the directory. (DIRECTORY) */
+        FILE_ADD_FILE                   = const_cpu_to_le32(0x00000002),
+        /* Right to append data to the file. (FILE) */
+        FILE_APPEND_DATA                = const_cpu_to_le32(0x00000004),
+        /* Right to create a subdirectory. (DIRECTORY) */
+        FILE_ADD_SUBDIRECTORY           = const_cpu_to_le32(0x00000004),
+        /* Right to read extended attributes. (FILE/DIRECTORY) */
+        FILE_READ_EA                    = const_cpu_to_le32(0x00000008),
+        /* Right to write extended attributes. (FILE/DIRECTORY) */
+        FILE_WRITE_EA                   = const_cpu_to_le32(0x00000010),
+        /* Right to execute a file. (FILE) */
+        FILE_EXECUTE                    = const_cpu_to_le32(0x00000020),
+        /* Right to traverse the directory. (DIRECTORY) */
+        FILE_TRAVERSE                   = const_cpu_to_le32(0x00000020),
+        /*
+         * Right to delete a directory and all the files it contains (its
+         * children), even if the files are read-only. (DIRECTORY)
+         */
+        FILE_DELETE_CHILD               = const_cpu_to_le32(0x00000040),
+        /* Right to read file attributes. (FILE/DIRECTORY) */
+        FILE_READ_ATTRIBUTES            = const_cpu_to_le32(0x00000080),
+        /* Right to change file attributes. (FILE/DIRECTORY) */
+        FILE_WRITE_ATTRIBUTES           = const_cpu_to_le32(0x00000100),
+        /*
+         * The standard rights (bits 16 to 23).  These are independent of the
+         * type of object being secured.
+         */
+        /* Right to delete the object. */
+        DELETE                          = const_cpu_to_le32(0x00010000),
+        /*
+         * Right to read the information in the object's security descriptor,
+         * not including the information in the SACL, i.e. right to read the
+         * security descriptor and owner.
+         */
+        READ_CONTROL                    = const_cpu_to_le32(0x00020000),
+        /* Right to modify the DACL in the object's security descriptor. */
+        WRITE_DAC                       = const_cpu_to_le32(0x00040000),
+        /* Right to change the owner in the object's security descriptor. */
+        WRITE_OWNER                     = const_cpu_to_le32(0x00080000),
+        /*
+         * Right to use the object for synchronization.  Enables a process to
+         * wait until the object is in the signalled state.  Some object types
+         * do not support this access right.
+         */
+        SYNCHRONIZE                     = const_cpu_to_le32(0x00100000),
+        /*
+         * The following STANDARD_RIGHTS_* are combinations of the above for
+         * convenience and are defined by the Win32 API.
+         */
+        /* These are currently defined to READ_CONTROL. */
+        STANDARD_RIGHTS_READ            = const_cpu_to_le32(0x00020000),
+        STANDARD_RIGHTS_WRITE           = const_cpu_to_le32(0x00020000),
+        STANDARD_RIGHTS_EXECUTE         = const_cpu_to_le32(0x00020000),
+        /* Combines DELETE, READ_CONTROL, WRITE_DAC, and WRITE_OWNER access. */
+        STANDARD_RIGHTS_REQUIRED        = const_cpu_to_le32(0x000f0000),
+        /*
+         * Combines DELETE, READ_CONTROL, WRITE_DAC, WRITE_OWNER, and
+         * SYNCHRONIZE access.
+         */
+        STANDARD_RIGHTS_ALL             = const_cpu_to_le32(0x001f0000),
+        /*
+         * The access system ACL and maximum allowed access types (bits 24 to
+         * 25, bits 26 to 27 are reserved).
+         */
+        ACCESS_SYSTEM_SECURITY          = const_cpu_to_le32(0x01000000),
+        MAXIMUM_ALLOWED                 = const_cpu_to_le32(0x02000000),
+        /*
+         * The generic rights (bits 28 to 31).  These map onto the standard and
+         * specific rights.
+         */
+        /* Read, write, and execute access. */
+        GENERIC_ALL                     = const_cpu_to_le32(0x10000000),
+        /* Execute access. */
+        GENERIC_EXECUTE                 = const_cpu_to_le32(0x20000000),
+        /*
+         * Write access.  For files, this maps onto:
+         *      FILE_APPEND_DATA | FILE_WRITE_ATTRIBUTES | FILE_WRITE_DATA |
+         *      FILE_WRITE_EA | STANDARD_RIGHTS_WRITE | SYNCHRONIZE
+         * For directories, the mapping has the same numerical value.  See
+         * above for the descriptions of the rights granted.
+         */
+        GENERIC_WRITE                   = const_cpu_to_le32(0x40000000),
+        /*
+         * Read access.  For files, this maps onto:
+         *      FILE_READ_ATTRIBUTES | FILE_READ_DATA | FILE_READ_EA |
+         *      STANDARD_RIGHTS_READ | SYNCHRONIZE
+         * For directories, the mapping has the same numerical value.  See
+         * above for the descriptions of the rights granted.
+         */
+        GENERIC_READ                    = const_cpu_to_le32(0x80000000),
+};
+typedef le32 ACCESS_MASK;       /* 32-bit little endian set of the rights. */
+/*
+ * The generic mapping array. Used to denote the mapping of each generic
+ * access right to a specific access mask.
+ *
+ * FIXME: What exactly is this and what is it for? (AIA)
+ */
+typedef struct {
+        ACCESS_MASK generic_read;       /* Mask for GENERIC_READ. */
+        ACCESS_MASK generic_write;      /* Mask for GENERIC_WRITE. */
+        ACCESS_MASK generic_execute;    /* Mask for GENERIC_EXECUTE. */
+        ACCESS_MASK generic_all;        /* Mask for GENERIC_ALL. */
+} __attribute__ ((__packed__)) GENERIC_MAPPING;
+/*
+ * The predefined ACE type structures are as defined below.
+ */
+/*
+ * ACCESS_ALLOWED_ACE, ACCESS_DENIED_ACE, SYSTEM_AUDIT_ACE, SYSTEM_ALARM_ACE
+ */
+typedef struct {
+/*  0   ACE_HEADER; -- Unfolded here as gcc doesn't like unnamed structs. */
+        ACE_TYPES type;         /* Type of the ACE. */
+        ACE_FLAGS flags;        /* Flags describing the ACE. */
+        le16 size;              /* Size in bytes of the ACE. */
+/*  4*/ ACCESS_MASK mask;       /* Access mask associated with the ACE. */
+/*  8*/ SID sid;                /* The (variable length) SID of the ACE. */
+} __attribute__ ((__packed__)) ACCESS_ALLOWED_ACE, ACCESS_DENIED_ACE,
+                               SYSTEM_AUDIT_ACE, SYSTEM_ALARM_ACE;
+/*
+ * The object ACE flags (32-bit).
+ */
+enum {
+        ACE_OBJECT_TYPE_PRESENT                 = const_cpu_to_le32(1),
+        ACE_INHERITED_OBJECT_TYPE_PRESENT       = const_cpu_to_le32(2),
+};
+typedef le32 OBJECT_ACE_FLAGS;  /* 32-bit little endian set of the above. */
+typedef struct {
+/*  0   ACE_HEADER; -- Unfolded here as gcc doesn't like unnamed structs. */
+        ACE_TYPES type;         /* Type of the ACE. */
+        ACE_FLAGS flags;        /* Flags describing the ACE. */
+        le16 size;              /* Size in bytes of the ACE. */
+/*  4*/ ACCESS_MASK mask;       /* Access mask associated with the ACE. */
+/*  8*/ OBJECT_ACE_FLAGS object_flags;  /* Flags describing the object ACE. */
+/* 12*/ GUID object_type;       /* Cf. ACE_OBJECT_TYPE_PRESENT (?). */
+/* 28*/ GUID inherited_object_type; /* Cf. ACE_INHERITED_OBJECT_TYPE_PRESENT (?) */
+/* 44*/ SID sid;                /* The SID associated with the ACE. */
+} __attribute__ ((__packed__)) ACCESS_ALLOWED_OBJECT_ACE,
+                               ACCESS_DENIED_OBJECT_ACE,
+                               SYSTEM_AUDIT_OBJECT_ACE,
+                               SYSTEM_ALARM_OBJECT_ACE;
+/*
+ * An ACL is an access-control list (ACL).
+ * An ACL starts with an ACL header structure, which specifies the size of
+ * the ACL and the number of ACEs it contains. The ACL header is followed by
+ * zero or more access control entries (ACEs). The ACL as well as each ACE
+ * are aligned on 4-byte boundaries.
+ */
+typedef struct {
+        u8 revision;    /* Revision of this ACL. */
+        u8 alignment1;  /* Alignment padding. */
+        le16 size;      /* Allocated space in bytes for ACL. Includes this
+                           header, the ACEs and the remaining free space. */
+        le16 ace_count; /* Number of ACEs in the ACL. */
+        le16 alignment2;/* Alignment padding. */
+/* sizeof() = 8 bytes */
+} __attribute__ ((__packed__)) ACL;
+/*
+ * Current constants for ACLs.
+ */
+typedef enum {
+        /* Current revision. */
+        ACL_REVISION            = 2,
+        ACL_REVISION_DS         = 4,
+        /* History of revisions. */
+        ACL_REVISION1           = 1,
+        MIN_ACL_REVISION        = 2,    /* Same value as ACL_REVISION2. */
+        ACL_REVISION2           = 2,
+        ACL_REVISION3           = 3,
+        ACL_REVISION4           = 4,
+        MAX_ACL_REVISION        = 4,    /* Same value as ACL_REVISION4. */
+} ACL_CONSTANTS;
+/*
+ * The security descriptor control flags (16-bit).
+ *
+ * SE_OWNER_DEFAULTED - This boolean flag, when set, indicates that the SID
+ *      pointed to by the Owner field was provided by a defaulting mechanism
+ *      rather than explicitly provided by the original provider of the
+ *      security descriptor.  This may affect the treatment of the SID with
+ *      respect to inheritance of an owner.
+ *
+ * SE_GROUP_DEFAULTED - This boolean flag, when set, indicates that the SID in
+ *      the Group field was provided by a defaulting mechanism rather than
+ *      explicitly provided by the original provider of the security
+ *      descriptor.  This may affect the treatment of the SID with respect to
+ *      inheritance of a primary group.
+ *
+ * SE_DACL_PRESENT - This boolean flag, when set, indicates that the security
+ *      descriptor contains a discretionary ACL.  If this flag is set and the
+ *      Dacl field of the SECURITY_DESCRIPTOR is null, then a null ACL is
+ *      explicitly being specified.
+ *
+ * SE_DACL_DEFAULTED - This boolean flag, when set, indicates that the ACL
+ *      pointed to by the Dacl field was provided by a defaulting mechanism
+ *      rather than explicitly provided by the original provider of the
+ *      security descriptor.  This may affect the treatment of the ACL with
+ *      respect to inheritance of an ACL.  This flag is ignored if the
+ *      DaclPresent flag is not set.
+ *
+ * SE_SACL_PRESENT - This boolean flag, when set,  indicates that the security
+ *      descriptor contains a system ACL pointed to by the Sacl field.  If this
+ *      flag is set and the Sacl field of the SECURITY_DESCRIPTOR is null, then
+ *      an empty (but present) ACL is being specified.
+ *
+ * SE_SACL_DEFAULTED - This boolean flag, when set, indicates that the ACL
+ *      pointed to by the Sacl field was provided by a defaulting mechanism
+ *      rather than explicitly provided by the original provider of the
+ *      security descriptor.  This may affect the treatment of the ACL with
+ *      respect to inheritance of an ACL.  This flag is ignored if the
+ *      SaclPresent flag is not set.
+ *
+ * SE_SELF_RELATIVE - This boolean flag, when set, indicates that the security
+ *      descriptor is in self-relative form.  In this form, all fields of the
+ *      security descriptor are contiguous in memory and all pointer fields are
+ *      expressed as offsets from the beginning of the security descriptor.
+ */
+enum {
+        SE_OWNER_DEFAULTED              = const_cpu_to_le16(0x0001),
+        SE_GROUP_DEFAULTED              = const_cpu_to_le16(0x0002),
+        SE_DACL_PRESENT                 = const_cpu_to_le16(0x0004),
+        SE_DACL_DEFAULTED               = const_cpu_to_le16(0x0008),
+        SE_SACL_PRESENT                 = const_cpu_to_le16(0x0010),
+        SE_SACL_DEFAULTED               = const_cpu_to_le16(0x0020),
+        SE_DACL_AUTO_INHERIT_REQ        = const_cpu_to_le16(0x0100),
+        SE_SACL_AUTO_INHERIT_REQ        = const_cpu_to_le16(0x0200),
+        SE_DACL_AUTO_INHERITED          = const_cpu_to_le16(0x0400),
+        SE_SACL_AUTO_INHERITED          = const_cpu_to_le16(0x0800),
+        SE_DACL_PROTECTED               = const_cpu_to_le16(0x1000),
+        SE_SACL_PROTECTED               = const_cpu_to_le16(0x2000),
+        SE_RM_CONTROL_VALID             = const_cpu_to_le16(0x4000),
+        SE_SELF_RELATIVE                = const_cpu_to_le16(0x8000)
+} __attribute__ ((__packed__));
+typedef le16 SECURITY_DESCRIPTOR_CONTROL; /* 16-bit little endian flags. */
+/*
+ * Self-relative security descriptor. Contains the owner and group SIDs as well
+ * as the sacl and dacl ACLs inside the security descriptor itself.
+ */
+typedef struct {
+        u8 revision;    /* Revision level of the security descriptor. */
+        u8 alignment;   /* Alignment padding. */
+        SECURITY_DESCRIPTOR_CONTROL control; /* Flags qualifying the type of
+                           the descriptor as well as the following fields. */
+        le32 owner;     /* Byte offset to a SID representing an object's
+                           owner. If this is zero, no owner SID is present in
+                           the descriptor. */
+        le32 group;     /* Byte offset to a SID representing an object's
+                           primary group. If this is zero, no primary group
+                           SID is present in the descriptor. */
+        le32 sacl;      /* Byte offset to a system ACL. Only valid, if
+                           SE_SACL_PRESENT is set in the control field. If
+                           SE_SACL_PRESENT is set but sacl is zero, a NULL ACL
+                           is specified. */
+        le32 dacl;      /* Byte offset to a discretionary ACL. Only valid, if
+                           SE_DACL_PRESENT is set in the control field. If
+                           SE_DACL_PRESENT is set but dacl is zero, a NULL ACL
+                           (unconditionally granting access) is specified. */
+/* sizeof() = 0x14 bytes */
+} __attribute__ ((__packed__)) SECURITY_DESCRIPTOR_RELATIVE;
+/*
+ * Absolute security descriptor. Does not contain the owner and group SIDs, nor
+ * the sacl and dacl ACLs inside the security descriptor. Instead, it contains
+ * pointers to these structures in memory. Obviously, absolute security
+ * descriptors are only useful for in memory representations of security
+ * descriptors. On disk, a self-relative security descriptor is used.
+ */
+typedef struct {
+        u8 revision;    /* Revision level of the security descriptor. */
+        u8 alignment;   /* Alignment padding. */
+        SECURITY_DESCRIPTOR_CONTROL control;    /* Flags qualifying the type of
+                           the descriptor as well as the following fields. */
+        SID *owner;     /* Points to a SID representing an object's owner. If
+                           this is NULL, no owner SID is present in the
+                           descriptor. */
+        SID *group;     /* Points to a SID representing an object's primary
+                           group. If this is NULL, no primary group SID is
+                           present in the descriptor. */
+        ACL *sacl;      /* Points to a system ACL. Only valid, if
+                           SE_SACL_PRESENT is set in the control field. If
+                           SE_SACL_PRESENT is set but sacl is NULL, a NULL ACL
+                           is specified. */
+        ACL *dacl;      /* Points to a discretionary ACL. Only valid, if
+                           SE_DACL_PRESENT is set in the control field. If
+                           SE_DACL_PRESENT is set but dacl is NULL, a NULL ACL
+                           (unconditionally granting access) is specified. */
+} __attribute__ ((__packed__)) SECURITY_DESCRIPTOR;
+/*
+ * Current constants for security descriptors.
+ */
+typedef enum {
+        /* Current revision. */
+        SECURITY_DESCRIPTOR_REVISION    = 1,
+        SECURITY_DESCRIPTOR_REVISION1   = 1,
+        /* NOTE: Size of the absolute form. Equals the relative form's 0x14
+           bytes only where pointers are 32-bit (e.g. ia32), not elsewhere. */
+        SECURITY_DESCRIPTOR_MIN_LENGTH  = sizeof(SECURITY_DESCRIPTOR),
+} SECURITY_DESCRIPTOR_CONSTANTS;
+/*
+ * Attribute: Security descriptor (0x50). A standard self-relative security
+ * descriptor, i.e. the attribute value is a SECURITY_DESCRIPTOR_RELATIVE.
+ *
+ * NOTE: Can be resident or non-resident.
+ * NOTE: Not used in NTFS 3.0+, as security descriptors are stored centrally
+ * in FILE_Secure and the correct descriptor is found using the security_id
+ * from the standard information attribute.
+ */
+typedef SECURITY_DESCRIPTOR_RELATIVE SECURITY_DESCRIPTOR_ATTR;
+/*
+ * On NTFS 3.0+, all security descriptors are stored in FILE_Secure. Only one
+ * referenced instance of each unique security descriptor is stored.
+ *
+ * FILE_Secure contains no unnamed data attribute, i.e. it has zero length. It
+ * does, however, contain two indexes ($SDH and $SII) as well as a named data
+ * stream ($SDS).
+ *
+ * Every unique security descriptor is assigned a unique security identifier
+ * (security_id, not to be confused with a SID). The security_id is unique for
+ * the NTFS volume and is used as an index into the $SII index, which maps
+ * security_ids to the security descriptor's storage location within the $SDS
+ * data attribute. The $SII index is sorted by ascending security_id.
+ *
+ * A simple hash is computed from each security descriptor. This hash is used
+ * as an index into the $SDH index, which maps security descriptor hashes to
+ * the security descriptor's storage location within the $SDS data attribute.
+ * The $SDH index is sorted by security descriptor hash and is stored in a B+
+ * tree. When searching $SDH (with the intent of determining whether or not a
+ * new security descriptor is already present in the $SDS data stream), if a
+ * matching hash is found, but the security descriptors do not match, the
+ * search in the $SDH index is continued, searching for a next matching hash.
+ *
+ * When a precise match is found, the security_id corresponding to the security
+ * descriptor in the $SDS attribute is read from the found $SDH index entry and
+ * is stored in the $STANDARD_INFORMATION attribute of the file/directory to
+ * which the security descriptor is being applied. The $STANDARD_INFORMATION
+ * attribute is present in all base mft records (i.e. in all files and
+ * directories).
+ *
+ * If a match is not found, the security descriptor is assigned a new unique
+ * security_id and is added to the $SDS data attribute. Then, entries
+ * referencing this security descriptor in the $SDS data attribute are
+ * added to the $SDH and $SII indexes.
+ *
+ * Note: Entries are never deleted from FILE_Secure, even if nothing
+ * references an entry any more.
+ */
+/*
+ * This header precedes each security descriptor in the $SDS data stream.
+ * This is also the index entry data part of both the $SII and $SDH indexes.
+ */
+typedef struct {
+        le32 hash;        /* Hash of the security descriptor. */
+        le32 security_id; /* The security_id assigned to the descriptor. */
+        le64 offset;      /* Byte offset of this entry in the $SDS stream. */
+        le32 length;      /* Size in bytes of this entry in $SDS stream. */
+} __attribute__ ((__packed__)) SECURITY_DESCRIPTOR_HEADER; /* 20 bytes */
+/*
+ * The $SDS data stream contains the security descriptors, aligned on 16-byte
+ * boundaries, sorted by security_id in a B+ tree. Security descriptors cannot
+ * cross 256kib boundaries (this restriction is imposed by the Windows cache
+ * manager). Each security descriptor is contained in a SDS_ENTRY structure.
+ * Also, each security descriptor is stored twice in the $SDS stream with a
+ * fixed offset of 0x40000 bytes (256kib, the Windows cache manager's max size)
+ * between them; i.e. if a SDS_ENTRY specifies an offset of 0x51d0, then the
+ * first copy of the security descriptor will be at offset 0x51d0 in the
+ * $SDS data stream and the second copy will be at offset 0x451d0.
+ */
+typedef struct {
+/*Ofs*/
+/*  0   SECURITY_DESCRIPTOR_HEADER; -- Unfolded here as gcc doesn't like
+                                       unnamed structs. */
+        le32 hash;        /* Hash of the security descriptor. */
+        le32 security_id; /* The security_id assigned to the descriptor. */
+        le64 offset;      /* Byte offset of this entry in the $SDS stream. */
+        le32 length;      /* Size in bytes of this entry in $SDS stream. */
+/* 20*/ SECURITY_DESCRIPTOR_RELATIVE sid; /* The self-relative security
+                                             descriptor (not a SID). */
+} __attribute__ ((__packed__)) SDS_ENTRY;
+/*
+ * The index entry key used in the $SII index. The collation type is
+ * COLLATION_NTOFS_ULONG.
+ */
+typedef struct {
+        le32 security_id; /* Sort key: security_id of the descriptor. */
+} __attribute__ ((__packed__)) SII_INDEX_KEY;
+/*
+ * The index entry key used in the $SDH index. The keys are sorted first by
+ * hash and then by security_id. The collation rule is
+ * COLLATION_NTOFS_SECURITY_HASH.
+ */
+typedef struct {
+        le32 hash;        /* First sort key: hash of the descriptor. */
+        le32 security_id; /* Second sort key: security_id of the descriptor. */
+} __attribute__ ((__packed__)) SDH_INDEX_KEY;
+/*
+ * Attribute: Volume name (0x60).
+ *
+ * NOTE: Always resident.
+ * NOTE: Present only in FILE_Volume.
+ */
+typedef struct {
+        ntfschar name[0];       /* Volume name in Unicode (zero-length array,
+                                   so sizeof() excludes the name). */
+} __attribute__ ((__packed__)) VOLUME_NAME;
+/*
+ * Possible flags for the volume (16-bit).
+ */
+enum {
+        VOLUME_IS_DIRTY                 = const_cpu_to_le16(0x0001),
+        VOLUME_RESIZE_LOG_FILE          = const_cpu_to_le16(0x0002),
+        VOLUME_UPGRADE_ON_MOUNT         = const_cpu_to_le16(0x0004),
+        VOLUME_MOUNTED_ON_NT4           = const_cpu_to_le16(0x0008),
+        VOLUME_DELETE_USN_UNDERWAY      = const_cpu_to_le16(0x0010),
+        VOLUME_REPAIR_OBJECT_ID         = const_cpu_to_le16(0x0020),
+        VOLUME_MODIFIED_BY_CHKDSK       = const_cpu_to_le16(0x8000),
+        VOLUME_FLAGS_MASK               = const_cpu_to_le16(0x803f),
+        /* All flags except VOLUME_MOUNTED_ON_NT4, for the read-only check. */
+        VOLUME_MUST_MOUNT_RO_MASK       = const_cpu_to_le16(0x8037),
+} __attribute__ ((__packed__));
+typedef le16 VOLUME_FLAGS;
+/*
+ * Attribute: Volume information (0x70).
+ *
+ * NOTE: Always resident.
+ * NOTE: Present only in FILE_Volume.
+ * NOTE: Windows 2000 uses NTFS 3.0 while Windows NT4 service pack 6a uses
+ *       NTFS 1.2. I haven't personally seen other values yet.
+ */
+typedef struct {
+        le64 reserved;          /* Not used (yet?). */
+        u8 major_ver;           /* Major version, e.g. 3 for NTFS 3.0. */
+        u8 minor_ver;           /* Minor version, e.g. 2 for NTFS 1.2. */
+        VOLUME_FLAGS flags;     /* Bit array of VOLUME_* flags. */
+} __attribute__ ((__packed__)) VOLUME_INFORMATION;
+/*
+ * Attribute: Data attribute (0x80).
+ *
+ * NOTE: Can be resident or non-resident.
+ *
+ * Data contents of a file (i.e. the unnamed stream) or of a named stream.
+ */
+typedef struct {
+        u8 data[0];             /* The stream's data contents. */
+} __attribute__ ((__packed__)) DATA_ATTR;
+/*
+ * Index header flags (8-bit). Bit 0 has two names, depending on context.
+ */
+enum {
+        /*
+         * When index header is in an index root attribute:
+         */
+        SMALL_INDEX = 0, /* The index is small enough to fit inside the index
+                            root attribute and there is no index allocation
+                            attribute present. */
+        LARGE_INDEX = 1, /* The index is too large to fit in the index root
+                            attribute and/or an index allocation attribute is
+                            present. */
+        /*
+         * When index header is in an index block, i.e. is part of index
+         * allocation attribute:
+         */
+        LEAF_NODE  = 0, /* This is a leaf node, i.e. there are no more nodes
+                           branching off it. */
+        INDEX_NODE = 1, /* This node indexes other nodes, i.e. it is not a leaf
+                           node. */
+        NODE_MASK  = 1, /* Mask for accessing the *_NODE bits. */
+} __attribute__ ((__packed__));
+typedef u8 INDEX_HEADER_FLAGS;
+/*
+ * This is the header for indexes, describing the INDEX_ENTRY records, which
+ * follow the INDEX_HEADER. Together the index header and the index entries
+ * make up a complete index.
+ *
+ * IMPORTANT NOTE: The offset, length and size structure members are counted
+ * relative to the start of the index header structure and not relative to the
+ * start of the index root or index allocation structures themselves.
+ */
+typedef struct {
+        le32 entries_offset;            /* Byte offset to first INDEX_ENTRY,
+                                           aligned to 8-byte boundary. */
+        le32 index_length;              /* Data size of the index in bytes,
+                                           i.e. bytes used from allocated
+                                           size, aligned to 8-byte boundary. */
+        le32 allocated_size;            /* Byte size of this index (block),
+                                           multiple of 8 bytes. */
+        /* NOTE: For the index root attribute, the above two numbers (i.e.
+           index_length and allocated_size) are always equal, as the attribute
+           is resident and it is resized as needed. In the case of the index
+           allocation attribute the attribute is not resident and hence the
+           allocated_size is a fixed value and must equal the index_block_size
+           specified by the INDEX_ROOT attribute corresponding to the
+           INDEX_ALLOCATION attribute this INDEX_BLOCK belongs to. */
+        INDEX_HEADER_FLAGS flags;       /* Bit field of INDEX_HEADER_FLAGS. */
+        u8 reserved[3];                 /* Reserved/align to 8-byte boundary. */
+} __attribute__ ((__packed__)) INDEX_HEADER;
+/*
+ * Attribute: Index root (0x90).
+ *
+ * NOTE: Always resident.
+ *
+ * This is followed by a sequence of index entries (INDEX_ENTRY structures)
+ * as described by the index header.
+ *
+ * When a directory is small enough to fit inside the index root then this
+ * is the only attribute describing the directory. When the directory is too
+ * large to fit in the index root, on the other hand, two additional attributes
+ * are present: an index allocation attribute, containing sub-nodes of the B+
+ * directory tree (see below), and a bitmap attribute, describing which virtual
+ * cluster numbers (vcns) in the index allocation attribute are in use by an
+ * index block.
+ *
+ * NOTE: The root directory (FILE_root) contains an entry for itself. Other
+ * directories do not contain entries for themselves, though.
+ */
+typedef struct {
+        ATTR_TYPE type;                 /* Type of the indexed attribute. Is
+                                           $FILE_NAME for directories, zero
+                                           for view indexes. No other values
+                                           allowed. */
+        COLLATION_RULE collation_rule;  /* Collation rule used to sort the
+                                           index entries. If type is $FILE_NAME,
+                                           this must be COLLATION_FILE_NAME. */
+        le32 index_block_size;          /* Size of each index block in bytes (in
+                                           the index allocation attribute). */
+        u8 clusters_per_index_block;    /* Cluster size of each index block (in
+                                           the index allocation attribute), when
+                                           an index block is >= a cluster,
+                                           otherwise this will be the log of
+                                           the size (like how the encoding of
+                                           the mft record size and the index
+                                           record size found in the boot sector
+                                           work). Has to be a power of 2. */
+        u8 reserved[3];                 /* Reserved/align to 8-byte boundary. */
+        INDEX_HEADER index;             /* Index header describing the
+                                           following index entries. */
+} __attribute__ ((__packed__)) INDEX_ROOT;
+/*
+ * Attribute: Index allocation (0xa0).
+ *
+ * NOTE: Always non-resident (doesn't make sense to be resident anyway!).
+ *
+ * This is an array of index blocks. Each index block starts with an
+ * INDEX_BLOCK structure containing an index header, followed by a sequence of
+ * index entries (INDEX_ENTRY structures), as described by the INDEX_HEADER.
+ */
+typedef struct {
+/*  0   NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */
+        NTFS_RECORD_TYPE magic; /* Magic is "INDX". */
+        le16 usa_ofs;           /* See NTFS_RECORD definition. */
+        le16 usa_count;         /* See NTFS_RECORD definition. */
+/*  8*/ sle64 lsn;              /* $LogFile sequence number of the last
+                                   modification of this index block. */
+/* 16*/ leVCN index_block_vcn;  /* Virtual cluster number of the index block.
+                                   If the cluster_size on the volume is <= the
+                                   index_block_size of the directory,
+                                   index_block_vcn counts in units of clusters,
+                                   and in units of sectors otherwise. */
+/* 24*/ INDEX_HEADER index;     /* Describes the following index entries. */
+/* sizeof()= 40 (0x28) bytes */
+/*
+ * When creating the index block, we place the update sequence array at this
+ * offset, i.e. before we start with the index entries. This also makes sense,
+ * otherwise we could run into problems with the update sequence array
+ * containing in itself the last two bytes of a sector which would mean that
+ * multi sector transfer protection wouldn't work, as you can't protect data
+ * by overwriting it since you then can't get it back...
+ * When reading, use the data from the ntfs record header.
+ */
+} __attribute__ ((__packed__)) INDEX_BLOCK;
+typedef INDEX_BLOCK INDEX_ALLOCATION;
+/*
+ * The system file FILE_Extend/$Reparse contains an index named $R listing
+ * all reparse points on the volume. The index entry keys are as defined
+ * below. Note that there is no index data associated with the index entries.
+ *
+ * The index entries are sorted by the index key file_id. The collation rule is
+ * COLLATION_NTOFS_ULONGS. FIXME: Verify whether the reparse_tag is not the
+ * primary key / is not a key at all. (AIA)
+ */
+typedef struct {
+        le32 reparse_tag;       /* Reparse point type (inc. flags). */
+        leMFT_REF file_id;      /* Mft record of the file containing the
+                                   reparse point attribute. */
+} __attribute__ ((__packed__)) REPARSE_INDEX_KEY;
+/*
+ * Quota flags (32-bit).
+ *
+ * The user quota flags.  Names explain meaning.
+ */
+enum {
+        QUOTA_FLAG_DEFAULT_LIMITS       = const_cpu_to_le32(0x00000001),
+        QUOTA_FLAG_LIMIT_REACHED        = const_cpu_to_le32(0x00000002),
+        QUOTA_FLAG_ID_DELETED           = const_cpu_to_le32(0x00000004),
+        QUOTA_FLAG_USER_MASK            = const_cpu_to_le32(0x00000007),
+        /* The above is the bit mask for the user quota flags. */
+        /*
+         * These flags are only present in the quota defaults index entry, i.e.
+         * in the entry where owner_id = QUOTA_DEFAULTS_ID.
+         */
+        QUOTA_FLAG_TRACKING_ENABLED     = const_cpu_to_le32(0x00000010),
+        QUOTA_FLAG_ENFORCEMENT_ENABLED  = const_cpu_to_le32(0x00000020),
+        QUOTA_FLAG_TRACKING_REQUESTED   = const_cpu_to_le32(0x00000040),
+        QUOTA_FLAG_LOG_THRESHOLD        = const_cpu_to_le32(0x00000080),
+        QUOTA_FLAG_LOG_LIMIT            = const_cpu_to_le32(0x00000100),
+        QUOTA_FLAG_OUT_OF_DATE          = const_cpu_to_le32(0x00000200),
+        QUOTA_FLAG_CORRUPT              = const_cpu_to_le32(0x00000400),
+        QUOTA_FLAG_PENDING_DELETES      = const_cpu_to_le32(0x00000800),
+};
+typedef le32 QUOTA_FLAGS;
+/*
+ * The system file FILE_Extend/$Quota contains two indexes $O and $Q. Quotas
+ * are on a per volume and per user basis.
+ *
+ * The $Q index contains one entry for each existing user_id on the volume. The
+ * index key is the user_id of the user/group owning this quota control entry,
+ * i.e. the key is the owner_id. The user_id of the owner of a file, i.e. the
+ * owner_id, is found in the standard information attribute. The collation rule
+ * for $Q is COLLATION_NTOFS_ULONG.
+ *
+ * The $O index contains one entry for each user/group who has been assigned
+ * a quota on that volume. The index key holds the SID of the user_id the
+ * entry belongs to, i.e. the owner_id. The collation rule for $O is
+ * COLLATION_NTOFS_SID.
+ *
+ * The $O index entry data is the user_id of the user corresponding to the SID.
+ * This user_id is used as an index into $Q to find the quota control entry
+ * associated with the SID.
+ *
+ * The $Q index entry data is the quota control entry and is defined below.
+ */
+typedef struct {
+        le32 version;           /* Currently equals 2 (QUOTA_VERSION). */
+        QUOTA_FLAGS flags;      /* Flags describing this quota entry. */
+        le64 bytes_used;        /* How many bytes of the quota are in use. */
+        sle64 change_time;      /* Last time this quota entry was changed. */
+        sle64 threshold;        /* Soft quota (-1 if not limited). */
+        sle64 limit;            /* Hard quota (-1 if not limited). */
+        sle64 exceeded_time;    /* How long the soft quota has been exceeded. */
+        SID sid;                /* The SID of the user/object associated with
+                                   this quota entry.  Equals zero for the quota
+                                   defaults entry (and in fact on a WinXP
+                                   volume, it is not present at all). */
+} __attribute__ ((__packed__)) QUOTA_CONTROL_ENTRY;
+/*
+ * Predefined owner_id values (32-bit), i.e. keys in the $Q index of $Quota.
+ */
+enum {
+        QUOTA_INVALID_ID        = const_cpu_to_le32(0x00000000),
+        QUOTA_DEFAULTS_ID       = const_cpu_to_le32(0x00000001),
+        QUOTA_FIRST_USER_ID     = const_cpu_to_le32(0x00000100),
+};
+/*
+ * Current constants for quota control entries.
+ */
+typedef enum {
+        /* Current value of the QUOTA_CONTROL_ENTRY version field. */
+        QUOTA_VERSION   = 2,
+} QUOTA_CONTROL_ENTRY_CONSTANTS;
+/*
+ * Index entry flags (16-bit).
+ */
+enum {
+        INDEX_ENTRY_NODE = const_cpu_to_le16(1), /* This entry contains a
+                        sub-node, i.e. a reference to an index block in the
+                        form of a virtual cluster number (see below). */
+        INDEX_ENTRY_END  = const_cpu_to_le16(2), /* This signifies the last
+                        entry in an index block.  The index entry does not
+                        represent a file but it can point to a sub-node. */
+        INDEX_ENTRY_SPACE_FILLER = const_cpu_to_le16(0xffff), /* gcc: Force
+                        enum bit width to 16-bit. */
+} __attribute__ ((__packed__));
+typedef le16 INDEX_ENTRY_FLAGS;
+/*
+ * This is the index entry header (see below).
+ */
+typedef struct {
+/*  0*/ union {
+                struct { /* Only valid when INDEX_ENTRY_END is not set. */
+                        leMFT_REF indexed_file; /* The mft reference of the file
+                                                   described by this index
+                                                   entry. Used for directory
+                                                   indexes. */
+                } __attribute__ ((__packed__)) dir;
+                struct { /* Used for views/indexes to find the entry's data. */
+                        le16 data_offset;       /* Data byte offset from this
+                                                   INDEX_ENTRY. Follows the
+                                                   index key. */
+                        le16 data_length;       /* Data length in bytes. */
+                        le32 reservedV;         /* Reserved (zero). */
+                } __attribute__ ((__packed__)) vi;
+        } __attribute__ ((__packed__)) data;
+/*  8*/ le16 length;             /* Byte size of this index entry, multiple of
+                                    8-bytes. */
+/* 10*/ le16 key_length;         /* Byte size of the key value, which is in the
+                                    index entry. It follows field reserved. Not
+                                    multiple of 8-bytes. */
+/* 12*/ INDEX_ENTRY_FLAGS flags; /* Bit field of INDEX_ENTRY_* flags. */
+/* 14*/ le16 reserved;           /* Reserved/align to 8-byte boundary. */
+/* sizeof() = 16 bytes */
+} __attribute__ ((__packed__)) INDEX_ENTRY_HEADER;
+/*
+ * This is an index entry. A sequence of such entries follows each INDEX_HEADER
+ * structure. Together they make up a complete index. The index follows either
+ * an index root attribute or an index allocation attribute.
+ *
+ * NOTE: Before NTFS 3.0 only filename attributes were indexed.
+ */
+typedef struct {
+/*Ofs*/
+/*  0   INDEX_ENTRY_HEADER; -- Unfolded here as gcc dislikes unnamed structs. */
+        union {
+                struct { /* Only valid when INDEX_ENTRY_END is not set. */
+                        leMFT_REF indexed_file; /* The mft reference of the file
+                                                   described by this index
+                                                   entry. Used for directory
+                                                   indexes. */
+                } __attribute__ ((__packed__)) dir;
+                struct { /* Used for views/indexes to find the entry's data. */
+                        le16 data_offset;       /* Data byte offset from this
+                                                   INDEX_ENTRY. Follows the
+                                                   index key. */
+                        le16 data_length;       /* Data length in bytes. */
+                        le32 reservedV;         /* Reserved (zero). */
+                } __attribute__ ((__packed__)) vi;
+        } __attribute__ ((__packed__)) data;
+        le16 length;             /* Byte size of this index entry, multiple of
+                                    8-bytes. */
+        le16 key_length;         /* Byte size of the key value, which is in the
+                                    index entry. It follows field reserved. Not
+                                    multiple of 8-bytes. */
+        INDEX_ENTRY_FLAGS flags; /* Bit field of INDEX_ENTRY_* flags. */
+        le16 reserved;           /* Reserved/align to 8-byte boundary. */
+/* 16*/ union {         /* The key of the indexed attribute. NOTE: Only present
+                           if INDEX_ENTRY_END bit in flags is not set. NOTE: On
+                           NTFS versions before 3.0 the only valid key is the
+                           FILE_NAME_ATTR. On NTFS 3.0+ the following
+                           additional index keys are defined: */
+                FILE_NAME_ATTR file_name;/* $I30 index in directories. */
+                SII_INDEX_KEY sii;      /* $SII index in $Secure. */
+                SDH_INDEX_KEY sdh;      /* $SDH index in $Secure. */
+                GUID object_id;         /* $O index in FILE_Extend/$ObjId: The
+                                           object_id of the mft record found in
+                                           the data part of the index. */
+                REPARSE_INDEX_KEY reparse;      /* $R index in
+                                                   FILE_Extend/$Reparse. */
+                SID sid;                /* $O index in FILE_Extend/$Quota:
+                                           SID of the owner of the user_id. */
+                le32 owner_id;          /* $Q index in FILE_Extend/$Quota:
+                                           user_id of the owner of the quota
+                                           control entry in the data part of
+                                           the index. */
+        } __attribute__ ((__packed__)) key;
+        /* The (optional) index data is inserted here when creating. */
+        // leVCN vcn;   /* If INDEX_ENTRY_NODE bit in flags is set, the last
+        //                 eight bytes of this index entry contain the virtual
+        //                 cluster number of the index block that holds the
+        //                 entries immediately preceding the current entry (the
+        //                 vcn references the corresponding cluster in the data
+        //                 of the non-resident index allocation attribute). If
+        //                 the key_length is zero, then the vcn immediately
+        //                 follows the INDEX_ENTRY_HEADER. Regardless of
+        //                 key_length, the address of the 8-byte boundary
+        //                 aligned vcn of INDEX_ENTRY{_HEADER} *ie is given by
+        //                 (char*)ie + le16_to_cpu(ie->length) - sizeof(VCN),
+        //                 where sizeof(VCN) can be hardcoded as 8 if wanted. */
+} __attribute__ ((__packed__)) INDEX_ENTRY;
+/*
+ * Attribute: Bitmap (0xb0).
+ *
+ * Contains an array of bits (aka a bitfield).
+ *
+ * When used in conjunction with the index allocation attribute, each bit
+ * corresponds to one index block within the index allocation attribute. Thus
+ * the number of bits in the bitmap * index block size / cluster size is the
+ * number of clusters in the index allocation attribute.
+ */
+typedef struct {
+        u8 bitmap[0];                   /* Array of bits (zero-length array). */
+} __attribute__ ((__packed__)) BITMAP_ATTR;
+/*
+ * The reparse point tag defines the type of the reparse point. It also
+ * includes several flags, which further describe the reparse point.
+ *
+ * The reparse point tag is an unsigned 32-bit value divided in three parts:
+ *
+ * 1. The least significant 16 bits (i.e. bits 0 to 15) specify the type of
+ *    the reparse point.
+ * 2. The 13 bits after this (i.e. bits 16 to 28) are reserved for future use.
+ * 3. The most significant three bits are flags describing the reparse point.
+ *    They are defined as follows:
+ *      bit 29: Name surrogate bit. If set, the filename is an alias for
+ *              another object in the system.
+ *      bit 30: High-latency bit. If set, accessing the first byte of data will
+ *              be slow. (E.g. the data is stored on a tape drive.)
+ *      bit 31: Microsoft bit. If set, the tag is owned by Microsoft. User
+ *              defined tags have to use zero here.
+ *
+ * These are the predefined reparse point tags:
+ */
+enum {
+        IO_REPARSE_TAG_IS_ALIAS         = const_cpu_to_le32(0x20000000),
+        IO_REPARSE_TAG_IS_HIGH_LATENCY  = const_cpu_to_le32(0x40000000),
+        IO_REPARSE_TAG_IS_MICROSOFT     = const_cpu_to_le32(0x80000000),
+        IO_REPARSE_TAG_RESERVED_ZERO    = const_cpu_to_le32(0x00000000),
+        IO_REPARSE_TAG_RESERVED_ONE     = const_cpu_to_le32(0x00000001),
+        IO_REPARSE_TAG_RESERVED_RANGE   = const_cpu_to_le32(0x00000001),
+        IO_REPARSE_TAG_NSS              = const_cpu_to_le32(0x68000005),
+        IO_REPARSE_TAG_NSS_RECOVER      = const_cpu_to_le32(0x68000006),
+        IO_REPARSE_TAG_SIS              = const_cpu_to_le32(0x68000007),
+        IO_REPARSE_TAG_DFS              = const_cpu_to_le32(0x68000008),
+        IO_REPARSE_TAG_MOUNT_POINT      = const_cpu_to_le32(0x88000003),
+        IO_REPARSE_TAG_HSM              = const_cpu_to_le32(0xa8000004),
+        IO_REPARSE_TAG_SYMBOLIC_LINK    = const_cpu_to_le32(0xe8000000),
+        IO_REPARSE_TAG_VALID_VALUES     = const_cpu_to_le32(0xe000ffff),
+};
+/*
+ * Attribute: Reparse point (0xc0).
+ *
+ * NOTE: Can be resident or non-resident.
+ */
+typedef struct {
+        le32 reparse_tag;               /* Reparse point type (inc. flags). */
+        le16 reparse_data_length;       /* Byte size of reparse data. */
+        le16 reserved;                  /* Reserved/align to 8-byte boundary. */
+        u8 reparse_data[0];             /* Meaning depends on reparse_tag. */
+} __attribute__ ((__packed__)) REPARSE_POINT;
+/*
+ * Attribute: Extended attribute (EA) information (0xd0).
+ *
+ * NOTE: Always resident. (Is this true?)
+ */
+typedef struct {
+        le16 ea_length;         /* Byte size of the packed extended
+                                   attributes. */
+        le16 need_ea_count;     /* The number of extended attributes which have
+                                   the NEED_EA bit set. */
+        le32 ea_query_length;   /* Byte size of the buffer required to query
+                                   the extended attributes when calling
+                                   ZwQueryEaFile() in Windows NT/2k. I.e. the
+                                   byte size of the unpacked extended
+                                   attributes. */
+} __attribute__ ((__packed__)) EA_INFORMATION;
+/*
+ * Extended attribute flags (8-bit).
+ */
+enum {
+        NEED_EA = 0x80  /* EAs with this set are counted in need_ea_count. */
+} __attribute__ ((__packed__));
+typedef u8 EA_FLAGS;
+/*
+ * Attribute: Extended attribute (EA) (0xe0).
+ *
+ * NOTE: Always non-resident. (Is this true?)
+ *
+ * Like the attribute list and the index buffer list, the EA attribute value is
+ * a sequence of EA_ATTR variable length records.
+ *
+ * FIXME: It appears weird that the EA name is not unicode. Is it true?
+ */
+typedef struct {
+        le32 next_entry_offset; /* Offset to the next EA_ATTR. */
+        EA_FLAGS flags;         /* Flags describing the EA. */
+        u8 ea_name_length;      /* Length of the name of the EA in bytes. */
+        le16 ea_value_length;   /* Byte size of the EA's value. */
+        u8 ea_name[0];          /* Name of the EA. */
+        u8 ea_value[0];         /* The value of the EA. Immediately follows
+                                   the name, not at this field's offset. */
+} __attribute__ ((__packed__)) EA_ATTR;
+/*
+ * Attribute: Property set (0xf0).
+ *
+ * Intended to support Native Structure Storage (NSS) - a feature removed from
+ * NTFS 3.0 during beta testing.
+ */
+typedef struct {
+        /* Layout is irrelevant as the feature is unused. */
+} __attribute__ ((__packed__)) PROPERTY_SET;
+/*
+ * Attribute: Logged utility stream (0x100).
+ *
+ * NOTE: Can be resident or non-resident.
+ *
+ * Operations on this attribute are logged to the journal ($LogFile) like
+ * normal metadata changes.
+ *
+ * Used by the Encrypting File System (EFS). All encrypted files have this
+ * attribute with the name $EFS.
+ */
+typedef struct {
+        /* Can be anything the creator chooses. */
+        /* EFS uses it as follows (not yet typed here, see the FIXME): */
+        // FIXME: Type this info, verifying it along the way. (AIA)
+} __attribute__ ((__packed__)) LOGGED_UTILITY_STREAM, EFS_ATTR;
+#endif /* _LINUX_NTFS_LAYOUT_H */
diff --git a/fs/ntfs/lcnalloc.c b/fs/ntfs/lcnalloc.c
new file mode 100644
index 000000000000..23fd911078b1
--- /dev/null
+++ b/fs/ntfs/lcnalloc.c
@@ -0,0 +1,1002 @@
+/*
+ * lcnalloc.c - Cluster (de)allocation code.  Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2004 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#ifdef NTFS_RW
+#include <linux/pagemap.h>
+#include "lcnalloc.h"
+#include "debug.h"
+#include "bitmap.h"
+#include "inode.h"
+#include "volume.h"
+#include "attrib.h"
+#include "malloc.h"
+#include "aops.h"
+#include "ntfs.h"
+/**
+ * ntfs_cluster_free_from_rl_nolock - free clusters from runlist
+ * @vol:        mounted ntfs volume on which to free the clusters
+ * @rl:         runlist describing the clusters to free
+ *
+ * Free all the clusters described by the runlist @rl on the volume @vol.  Runs
+ * with a negative lcn are skipped and a failing run does not stop the loop.
+ *
+ * Return 0 on success and -errno on error, in which case at least some of the
+ * clusters were not freed.  A recorded -ENOMEM is replaced by a later error.
+ *
+ * Locking: - The volume lcn bitmap must be locked for writing on entry and is
+ *            left locked on return.
+ */
+int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol,
+                const runlist_element *rl)
+{
+        struct inode *lcnbmp_vi = vol->lcnbmp_ino;
+        int ret = 0;
+        ntfs_debug("Entering.");
+        for (; rl->length; rl++) {      /* A zero length run terminates @rl. */
+                int err;
+                if (rl->lcn < 0)        /* No real clusters behind this run. */
+                        continue;
+                err = ntfs_bitmap_clear_run(lcnbmp_vi, rl->lcn, rl->length);
+                if (unlikely(err && (!ret || ret == -ENOMEM) && ret != err))
+                        ret = err;      /* Errors are negative: -ENOMEM. */
+        }
+        ntfs_debug("Done.");
+        return ret;
+}
+/**
+ * ntfs_cluster_alloc - allocate clusters on an ntfs volume
+ * @vol:        mounted ntfs volume on which to allocate the clusters
+ * @start_vcn:  vcn to use for the first allocated cluster
+ * @count:      number of clusters to allocate
+ * @start_lcn:  starting lcn at which to allocate the clusters (or -1 if none)
+ * @zone:       zone from which to allocate the clusters
+ *
+ * Allocate @count clusters preferably starting at cluster @start_lcn or at the
+ * current allocator position if @start_lcn is -1, on the mounted ntfs volume
+ * @vol. @zone is either DATA_ZONE for allocation of normal clusters or
+ * MFT_ZONE for allocation of clusters for the master file table, i.e. the
+ * $MFT/$DATA attribute.
+ *
+ * @start_vcn specifies the vcn of the first allocated cluster.  This makes
+ * merging the resulting runlist with the old runlist easier.
+ *
+ * You need to check the return value with IS_ERR().  If this is false, the
+ * function was successful and the return value is a runlist describing the
+ * allocated cluster(s).  If IS_ERR() is true, the function failed and
+ * PTR_ERR() gives you the error code.
+ *
+ * Notes on the allocation algorithm
+ * =================================
+ *
+ * There are two data zones.  First is the area between the end of the mft zone
+ * and the end of the volume, and second is the area between the start of the
+ * volume and the start of the mft zone.  On unmodified/standard NTFS 1.x
+ * volumes, the second data zone does not exist due to the mft zone being
+ * expanded to cover the start of the volume in order to reserve space for the
+ * mft bitmap attribute.
+ *
+ * This is not the prettiest function but the complexity stems from the need of
+ * implementing the mft vs data zoned approach and from the fact that we have
+ * access to the lcn bitmap in portions of up to 8192 bytes at a time, so we
+ * need to cope with crossing over boundaries of two buffers.  Further, the
+ * fact that the allocator allows for caller supplied hints as to the location
+ * of where allocation should begin and the fact that the allocator keeps track
+ * of where in the data zones the next natural allocation should occur,
+ * contribute to the complexity of the function.  But it should all be
+ * worthwhile, because this allocator should: 1) be a full implementation of
+ * the MFT zone approach used by Windows NT, 2) cause reduction in
+ * fragmentation, and 3) be speedy in allocations (the code is not optimized
+ * for speed, but the algorithm is, so further speed improvements are probably
+ * possible).
+ *
+ * FIXME: We should be monitoring cluster allocation and increment the MFT zone
+ * size dynamically but this is something for the future.  We will just cause
+ * heavier fragmentation by not doing it and I am not even sure Windows would
+ * grow the MFT zone dynamically, so it might even be correct not to do this.
+ * The overhead in doing dynamic MFT zone expansion would be very large and
+ * unlikely worth the effort. (AIA)
+ *
+ * TODO: I have added in double the required zone position pointer wrap around
+ * logic which can be optimized to having only one of the two logic sets.
+ * However, having the double logic will work fine, but if we have only one of
+ * the sets and we get it wrong somewhere, then we get into trouble, so
+ * removing the duplicate logic requires _very_ careful consideration of _all_
+ * possible code paths.  So at least for now, I am leaving the double logic -
+ * better safe than sorry... (AIA)
+ *
+ * Locking: - The volume lcn bitmap must be unlocked on entry and is unlocked
+ *            on return.
+ *          - This function takes the volume lcn bitmap lock for writing and
+ *            modifies the bitmap contents.
+ */
+runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn,
+                const s64 count, const LCN start_lcn,
+                const NTFS_CLUSTER_ALLOCATION_ZONES zone)
+{
+        LCN zone_start, zone_end, bmp_pos, bmp_initial_pos, last_read_pos, lcn;
+        LCN prev_lcn = 0, prev_run_len = 0, mft_zone_size;
+        s64 clusters;
+        struct inode *lcnbmp_vi;
+        runlist_element *rl = NULL;
+        struct address_space *mapping;
+        struct page *page = NULL;
+        u8 *buf, *byte;
+        int err = 0, rlpos, rlsize, buf_size;
+        u8 pass, done_zones, search_zone, need_writeback = 0, bit;
+        ntfs_debug("Entering for start_vcn 0x%llx, count 0x%llx, start_lcn "
+                        "0x%llx, zone %s_ZONE.", (unsigned long long)start_vcn,
+                        (unsigned long long)count,
+                        (unsigned long long)start_lcn,
+                        zone == MFT_ZONE ? "MFT" : "DATA");
+        BUG_ON(!vol);
+        lcnbmp_vi = vol->lcnbmp_ino;
+        BUG_ON(!lcnbmp_vi);
+        BUG_ON(start_vcn < 0);
+        BUG_ON(count < 0);
+        BUG_ON(start_lcn < -1);
+        BUG_ON(zone < FIRST_ZONE);
+        BUG_ON(zone > LAST_ZONE);
+        /* Return empty runlist if @count == 0 */
+        // FIXME: Do we want to just return NULL instead? (AIA)
+        if (!count) {
+                rl = ntfs_malloc_nofs(PAGE_SIZE);
+                if (!rl)
+                        return ERR_PTR(-ENOMEM);
+                rl[0].vcn = start_vcn;
+                rl[0].lcn = LCN_RL_NOT_MAPPED;
+                rl[0].length = 0;
+                return rl;
+        }
+        /* Take the lcnbmp lock for writing. */
+        down_write(&vol->lcnbmp_lock);
+        /*
+         * If no specific @start_lcn was requested, use the current data zone
+         * position, otherwise use the requested @start_lcn but make sure it
+         * lies outside the mft zone.  Also set done_zones to 0 (no zones done)
+         * and pass depending on whether we are starting inside a zone (1) or
+         * at the beginning of a zone (2).  If requesting from the MFT_ZONE,
+         * we either start at the current position within the mft zone or at
+         * the specified position.  If the latter is out of bounds then we start
+         * at the beginning of the MFT_ZONE.
+         */
+        done_zones = 0;
+        pass = 1;
+        /*
+         * zone_start and zone_end are the current search range.  search_zone
+         * is 1 for mft zone, 2 for data zone 1 (end of mft zone till end of
+         * volume) and 4 for data zone 2 (start of volume till start of mft
+         * zone).
+         */
+        zone_start = start_lcn;
+        if (zone_start < 0) {
+                if (zone == DATA_ZONE)
+                        zone_start = vol->data1_zone_pos;
+                else
+                        zone_start = vol->mft_zone_pos;
+                if (!zone_start) {
+                        /*
+                         * Zone starts at beginning of volume which means a
+                         * single pass is sufficient.
+                         */
+                        pass = 2;
+                }
+        } else if (zone == DATA_ZONE && zone_start >= vol->mft_zone_start &&
+                        zone_start < vol->mft_zone_end) {
+                zone_start = vol->mft_zone_end;
+                /*
+                 * Starting at beginning of data1_zone which means a single
+                 * pass in this zone is sufficient.
+                 */
+                pass = 2;
+        } else if (zone == MFT_ZONE && (zone_start < vol->mft_zone_start ||
+                        zone_start >= vol->mft_zone_end)) {
+                zone_start = vol->mft_lcn;
+                if (!vol->mft_zone_end)
+                        zone_start = 0;
+                /*
+                 * Starting at beginning of volume which means a single pass
+                 * is sufficient.
+                 */
+                pass = 2;
+        }
+        if (zone == MFT_ZONE) {
+                zone_end = vol->mft_zone_end;
+                search_zone = 1;
+        } else /* if (zone == DATA_ZONE) */ {
+                /* Skip searching the mft zone. */
+                done_zones |= 1;
+                if (zone_start >= vol->mft_zone_end) {
+                        zone_end = vol->nr_clusters;
+                        search_zone = 2;
+                } else {
+                        zone_end = vol->mft_zone_start;
+                        search_zone = 4;
+                }
+        }
+        /*
+         * bmp_pos is the current bit position inside the bitmap.  We use
+         * bmp_initial_pos to determine whether or not to do a zone switch.
+         */
+        bmp_pos = bmp_initial_pos = zone_start;
+        /* Loop until all clusters are allocated, i.e. clusters == 0. */
+        clusters = count;
+        rlpos = rlsize = 0;
+        mapping = lcnbmp_vi->i_mapping;
+        while (1) {
+                ntfs_debug("Start of outer while loop: done_zones 0x%x, "
+                                "search_zone %i, pass %i, zone_start 0x%llx, "
+                                "zone_end 0x%llx, bmp_initial_pos 0x%llx, "
+                                "bmp_pos 0x%llx, rlpos %i, rlsize %i.",
+                                done_zones, search_zone, pass,
+                                (unsigned long long)zone_start,
+                                (unsigned long long)zone_end,
+                                (unsigned long long)bmp_initial_pos,
+                                (unsigned long long)bmp_pos, rlpos, rlsize);
+                /* Loop until we run out of free clusters. */
+                last_read_pos = bmp_pos >> 3;
+                ntfs_debug("last_read_pos 0x%llx.",
+                                (unsigned long long)last_read_pos);
+                if (last_read_pos > lcnbmp_vi->i_size) {
+                        ntfs_debug("End of attribute reached.  "
+                                        "Skipping to zone_pass_done.");
+                        goto zone_pass_done;
+                }
+                if (likely(page)) {
+                        if (need_writeback) {
+                                ntfs_debug("Marking page dirty.");
+                                flush_dcache_page(page);
+                                set_page_dirty(page);
+                                need_writeback = 0;
+                        }
+                        ntfs_unmap_page(page);
+                }
+                page = ntfs_map_page(mapping, last_read_pos >>
+                                PAGE_CACHE_SHIFT);
+                if (IS_ERR(page)) {
+                        err = PTR_ERR(page);
+                        ntfs_error(vol->sb, "Failed to map page.");
+                        goto out;
+                }
+                buf_size = last_read_pos & ~PAGE_CACHE_MASK;
+                buf = page_address(page) + buf_size;
+                buf_size = PAGE_CACHE_SIZE - buf_size;
+                if (unlikely(last_read_pos + buf_size > lcnbmp_vi->i_size))
+                        buf_size = lcnbmp_vi->i_size - last_read_pos;
+                buf_size <<= 3;
+                lcn = bmp_pos & 7;
+                bmp_pos &= ~7;
+                ntfs_debug("Before inner while loop: buf_size %i, lcn 0x%llx, "
+                                "bmp_pos 0x%llx, need_writeback %i.", buf_size,
+                                (unsigned long long)lcn,
+                                (unsigned long long)bmp_pos, need_writeback);
+                while (lcn < buf_size && lcn + bmp_pos < zone_end) {
+                        byte = buf + (lcn >> 3);
+                        ntfs_debug("In inner while loop: buf_size %i, "
+                                        "lcn 0x%llx, bmp_pos 0x%llx, "
+                                        "need_writeback %i, byte ofs 0x%x, "
+                                        "*byte 0x%x.", buf_size,
+                                        (unsigned long long)lcn,
+                                        (unsigned long long)bmp_pos,
+                                        need_writeback,
+                                        (unsigned int)(lcn >> 3),
+                                        (unsigned int)*byte);
+                        /* Skip full bytes. */
+                        if (*byte == 0xff) {
+                                lcn = (lcn + 8) & ~7;
+                                ntfs_debug("Continuing while loop 1.");
+                                continue;
+                        }
+                        bit = 1 << (lcn & 7);
+                        ntfs_debug("bit %i.", bit);
+                        /* If the bit is already set, go onto the next one. */
+                        if (*byte & bit) {
+                                lcn++;
+                                ntfs_debug("Continuing while loop 2.");
+                                continue;
+                        }
+                        /*
+                         * Allocate more memory if needed, including space for
+                         * the terminator element.
+                         * ntfs_malloc_nofs() operates on whole pages only.
+                         */
+                        if ((rlpos + 2) * sizeof(*rl) > rlsize) {
+                                runlist_element *rl2;
+                                ntfs_debug("Reallocating memory.");
+                                if (!rl)
+                                        ntfs_debug("First free bit is at LCN "
+                                                        "0x%llx.",
+                                                        (unsigned long long)
+                                                        (lcn + bmp_pos));
+                                rl2 = ntfs_malloc_nofs(rlsize + (int)PAGE_SIZE);
+                                if (unlikely(!rl2)) {
+                                        err = -ENOMEM;
+                                        ntfs_error(vol->sb, "Failed to "
+                                                        "allocate memory.");
+                                        goto out;
+                                }
+                                memcpy(rl2, rl, rlsize);
+                                ntfs_free(rl);
+                                rl = rl2;
+                                rlsize += PAGE_SIZE;
+                                ntfs_debug("Reallocated memory, rlsize 0x%x.",
+                                                rlsize);
+                        }
+                        /* Allocate the bitmap bit. */
+                        *byte |= bit;
+                        /* We need to write this bitmap page to disk. */
+                        need_writeback = 1;
+                        ntfs_debug("*byte 0x%x, need_writeback is set.",
+                                        (unsigned int)*byte);
+                        /*
+                         * Coalesce with previous run if adjacent LCNs.
+                         * Otherwise, append a new run.
+                         */
+                        ntfs_debug("Adding run (lcn 0x%llx, len 0x%llx), "
+                                        "prev_lcn 0x%llx, lcn 0x%llx, "
+                                        "bmp_pos 0x%llx, prev_run_len 0x%llx, "
+                                        "rlpos %i.",
+                                        (unsigned long long)(lcn + bmp_pos),
+                                        1ULL, (unsigned long long)prev_lcn,
+                                        (unsigned long long)lcn,
+                                        (unsigned long long)bmp_pos,
+                                        (unsigned long long)prev_run_len,
+                                        rlpos);
+                        if (prev_lcn == lcn + bmp_pos - prev_run_len && rlpos) {
+                                ntfs_debug("Coalescing to run (lcn 0x%llx, "
+                                                "len 0x%llx).",
+                                                (unsigned long long)
+                                                rl[rlpos - 1].lcn,
+                                                (unsigned long long)
+                                                rl[rlpos - 1].length);
+                                rl[rlpos - 1].length = ++prev_run_len;
+                                ntfs_debug("Run now (lcn 0x%llx, len 0x%llx), "
+                                                "prev_run_len 0x%llx.",
+                                                (unsigned long long)
+                                                rl[rlpos - 1].lcn,
+                                                (unsigned long long)
+                                                rl[rlpos - 1].length,
+                                                (unsigned long long)
+                                                prev_run_len);
+                        } else {
+                                if (likely(rlpos)) {
+                                        ntfs_debug("Adding new run, (previous "
+                                                        "run lcn 0x%llx, "
+                                                        "len 0x%llx).",
+                                                        (unsigned long long)
+                                                        rl[rlpos - 1].lcn,
+                                                        (unsigned long long)
+                                                        rl[rlpos - 1].length);
+                                        rl[rlpos].vcn = rl[rlpos - 1].vcn +
+                                                        prev_run_len;
+                                } else {
+                                        ntfs_debug("Adding new run, is first "
+                                                        "run.");
+                                        rl[rlpos].vcn = start_vcn;
+                                }
+                                rl[rlpos].lcn = prev_lcn = lcn + bmp_pos;
+                                rl[rlpos].length = prev_run_len = 1;
+                                rlpos++;
+                        }
+                        /* Done? */
+                        if (!--clusters) {
+                                LCN tc;
+                                /*
+                                 * Update the current zone position.  Positions
+                                 * of already scanned zones have been updated
+                                 * during the respective zone switches.
+                                 */
+                                tc = lcn + bmp_pos + 1;
+                                ntfs_debug("Done. Updating current zone "
+                                                "position, tc 0x%llx, "
+                                                "search_zone %i.",
+                                                (unsigned long long)tc,
+                                                search_zone);
+                                switch (search_zone) {
+                                case 1:
+                                        ntfs_debug("Before checks, "
+                                                        "vol->mft_zone_pos "
+                                                        "0x%llx.",
+                                                        (unsigned long long)
+                                                        vol->mft_zone_pos);
+                                        if (tc >= vol->mft_zone_end) {
+                                                vol->mft_zone_pos =
+                                                                vol->mft_lcn;
+                                                if (!vol->mft_zone_end)
+                                                        vol->mft_zone_pos = 0;
+                                        } else if ((bmp_initial_pos >=
+                                                        vol->mft_zone_pos ||
+                                                        tc > vol->mft_zone_pos)
+                                                        && tc >= vol->mft_lcn)
+                                                vol->mft_zone_pos = tc;
+                                        ntfs_debug("After checks, "
+                                                        "vol->mft_zone_pos "
+                                                        "0x%llx.",
+                                                        (unsigned long long)
+                                                        vol->mft_zone_pos);
+                                        break;
+                                case 2:
+                                        ntfs_debug("Before checks, "
+                                                        "vol->data1_zone_pos "
+                                                        "0x%llx.",
+                                                        (unsigned long long)
+                                                        vol->data1_zone_pos);
+                                        if (tc >= vol->nr_clusters)
+                                                vol->data1_zone_pos =
+                                                             vol->mft_zone_end;
+                                        else if ((bmp_initial_pos >=
+                                                    vol->data1_zone_pos ||
+                                                    tc > vol->data1_zone_pos)
+                                                    && tc >= vol->mft_zone_end)
+                                                vol->data1_zone_pos = tc;
+                                        ntfs_debug("After checks, "
+                                                        "vol->data1_zone_pos "
+                                                        "0x%llx.",
+                                                        (unsigned long long)
+                                                        vol->data1_zone_pos);
+                                        break;
+                                case 4:
+                                        ntfs_debug("Before checks, "
+                                                        "vol->data2_zone_pos "
+                                                        "0x%llx.",
+                                                        (unsigned long long)
+                                                        vol->data2_zone_pos);
+                                        if (tc >= vol->mft_zone_start)
+                                                vol->data2_zone_pos = 0;
+                                        else if (bmp_initial_pos >=
+                                                      vol->data2_zone_pos ||
+                                                      tc > vol->data2_zone_pos)
+                                                vol->data2_zone_pos = tc;
+                                        ntfs_debug("After checks, "
+                                                        "vol->data2_zone_pos "
+                                                        "0x%llx.",
+                                                        (unsigned long long)
+                                                        vol->data2_zone_pos);
+                                        break;
+                                default:
+                                        BUG();
+                                }
+                                ntfs_debug("Finished.  Going to out.");
+                                goto out;
+                        }
+                        lcn++;
+                }
+                bmp_pos += buf_size;
+                ntfs_debug("After inner while loop: buf_size 0x%x, lcn "
+                                "0x%llx, bmp_pos 0x%llx, need_writeback %i.",
+                                buf_size, (unsigned long long)lcn,
+                                (unsigned long long)bmp_pos, need_writeback);
+                if (bmp_pos < zone_end) {
+                        ntfs_debug("Continuing outer while loop, "
+                                        "bmp_pos 0x%llx, zone_end 0x%llx.",
+                                        (unsigned long long)bmp_pos,
+                                        (unsigned long long)zone_end);
+                        continue;
+                }
+zone_pass_done: /* Finished with the current zone pass. */
+                ntfs_debug("At zone_pass_done, pass %i.", pass);
+                if (pass == 1) {
+                        /*
+                         * Now do pass 2, scanning the first part of the zone
+                         * we omitted in pass 1.
+                         */
+                        pass = 2;
+                        zone_end = zone_start;
+                        switch (search_zone) {
+                        case 1: /* mft_zone */
+                                zone_start = vol->mft_zone_start;
+                                break;
+                        case 2: /* data1_zone */
+                                zone_start = vol->mft_zone_end;
+                                break;
+                        case 4: /* data2_zone */
+                                zone_start = 0;
+                                break;
+                        default:
+                                BUG();
+                        }
+                        /* Sanity check. */
+                        if (zone_end < zone_start)
+                                zone_end = zone_start;
+                        bmp_pos = zone_start;
+                        ntfs_debug("Continuing outer while loop, pass 2, "
+                                        "zone_start 0x%llx, zone_end 0x%llx, "
+                                        "bmp_pos 0x%llx.",
+                                        (unsigned long long)zone_start,
+                                        (unsigned long long)zone_end,
+                                        (unsigned long long)bmp_pos);
+                        continue;
+                } /* pass == 2 */
+done_zones_check:
+                ntfs_debug("At done_zones_check, search_zone %i, done_zones "
+                                "before 0x%x, done_zones after 0x%x.",
+                                search_zone, done_zones,
+                                done_zones | search_zone);
+                done_zones |= search_zone;
+                if (done_zones < 7) {
+                        ntfs_debug("Switching zone.");
+                        /* Now switch to the next zone we haven't done yet. */
+                        pass = 1;
+                        switch (search_zone) {
+                        case 1:
+                                ntfs_debug("Switching from mft zone to data1 "
+                                                "zone.");
+                                /* Update mft zone position. */
+                                if (rlpos) {
+                                        LCN tc;
+                                        ntfs_debug("Before checks, "
+                                                        "vol->mft_zone_pos "
+                                                        "0x%llx.",
+                                                        (unsigned long long)
+                                                        vol->mft_zone_pos);
+                                        tc = rl[rlpos - 1].lcn +
+                                                        rl[rlpos - 1].length;
+                                        if (tc >= vol->mft_zone_end) {
+                                                vol->mft_zone_pos =
+                                                                vol->mft_lcn;
+                                                if (!vol->mft_zone_end)
+                                                        vol->mft_zone_pos = 0;
+                                        } else if ((bmp_initial_pos >=
+                                                        vol->mft_zone_pos ||
+                                                        tc > vol->mft_zone_pos)
+                                                        && tc >= vol->mft_lcn)
+                                                vol->mft_zone_pos = tc;
+                                        ntfs_debug("After checks, "
+                                                        "vol->mft_zone_pos "
+                                                        "0x%llx.",
+                                                        (unsigned long long)
+                                                        vol->mft_zone_pos);
+                                }
+                                /* Switch from mft zone to data1 zone. */
+switch_to_data1_zone:           search_zone = 2;
+                                zone_start = bmp_initial_pos =
+                                                vol->data1_zone_pos;
+                                zone_end = vol->nr_clusters;
+                                if (zone_start == vol->mft_zone_end)
+                                        pass = 2;
+                                if (zone_start >= zone_end) {
+                                        vol->data1_zone_pos = zone_start =
+                                                        vol->mft_zone_end;
+                                        pass = 2;
+                                }
+                                break;
+                        case 2:
+                                ntfs_debug("Switching from data1 zone to "
+                                                "data2 zone.");
+                                /* Update data1 zone position. */
+                                if (rlpos) {
+                                        LCN tc;
+                                        ntfs_debug("Before checks, "
+                                                        "vol->data1_zone_pos "
+                                                        "0x%llx.",
+                                                        (unsigned long long)
+                                                        vol->data1_zone_pos);
+                                        tc = rl[rlpos - 1].lcn +
+                                                        rl[rlpos - 1].length;
+                                        if (tc >= vol->nr_clusters)
+                                                vol->data1_zone_pos =
+                                                             vol->mft_zone_end;
+                                        else if ((bmp_initial_pos >=
+                                                    vol->data1_zone_pos ||
+                                                    tc > vol->data1_zone_pos)
+                                                    && tc >= vol->mft_zone_end)
+                                                vol->data1_zone_pos = tc;
+                                        ntfs_debug("After checks, "
+                                                        "vol->data1_zone_pos "
+                                                        "0x%llx.",
+                                                        (unsigned long long)
+                                                        vol->data1_zone_pos);
+                                }
+                                /* Switch from data1 zone to data2 zone. */
+                                search_zone = 4;
+                                zone_start = bmp_initial_pos =
+                                                vol->data2_zone_pos;
+                                zone_end = vol->mft_zone_start;
+                                if (!zone_start)
+                                        pass = 2;
+                                if (zone_start >= zone_end) {
+                                        vol->data2_zone_pos = zone_start =
+                                                        bmp_initial_pos = 0;
+                                        pass = 2;
+                                }
+                                break;
+                        case 4:
+                                ntfs_debug("Switching from data2 zone to "
+                                                "data1 zone.");
+                                /* Update data2 zone position. */
+                                if (rlpos) {
+                                        LCN tc;
+                                        ntfs_debug("Before checks, "
+                                                        "vol->data2_zone_pos "
+                                                        "0x%llx.",
+                                                        (unsigned long long)
+                                                        vol->data2_zone_pos);
+                                        tc = rl[rlpos - 1].lcn +
+                                                        rl[rlpos - 1].length;
+                                        if (tc >= vol->mft_zone_start)
+                                                vol->data2_zone_pos = 0;
+                                        else if (bmp_initial_pos >=
+                                                      vol->data2_zone_pos ||
+                                                      tc > vol->data2_zone_pos)
+                                                vol->data2_zone_pos = tc;
+                                        ntfs_debug("After checks, "
+                                                        "vol->data2_zone_pos "
+                                                        "0x%llx.",
+                                                        (unsigned long long)
+                                                        vol->data2_zone_pos);
+                                }
+                                /* Switch from data2 zone to data1 zone. */
+                                goto switch_to_data1_zone;
+                        default:
+                                BUG();
+                        }
+                        ntfs_debug("After zone switch, search_zone %i, "
+                                        "pass %i, bmp_initial_pos 0x%llx, "
+                                        "zone_start 0x%llx, zone_end 0x%llx.",
+                                        search_zone, pass,
+                                        (unsigned long long)bmp_initial_pos,
+                                        (unsigned long long)zone_start,
+                                        (unsigned long long)zone_end);
+                        bmp_pos = zone_start;
+                        if (zone_start == zone_end) {
+                                ntfs_debug("Empty zone, going to "
+                                                "done_zones_check.");
+                                /* Empty zone. Don't bother searching it. */
+                                goto done_zones_check;
+                        }
+                        ntfs_debug("Continuing outer while loop.");
+                        continue;
+                } /* done_zones == 7 */
+                ntfs_debug("All zones are finished.");
+                /*
+                 * All zones are finished!  If DATA_ZONE, shrink mft zone.  If
+                 * MFT_ZONE, we have really run out of space.
+                 */
+                mft_zone_size = vol->mft_zone_end - vol->mft_zone_start;
+                ntfs_debug("vol->mft_zone_start 0x%llx, vol->mft_zone_end "
+                                "0x%llx, mft_zone_size 0x%llx.",
+                                (unsigned long long)vol->mft_zone_start,
+                                (unsigned long long)vol->mft_zone_end,
+                                (unsigned long long)mft_zone_size);
+                if (zone == MFT_ZONE || mft_zone_size <= 0) {
+                        ntfs_debug("No free clusters left, going to out.");
+                        /* Really no more space left on device. */
+                        err = ENOSPC;
+                        goto out;
+                } /* zone == DATA_ZONE && mft_zone_size > 0 */
+                ntfs_debug("Shrinking mft zone.");
+                zone_end = vol->mft_zone_end;
+                mft_zone_size >>= 1;
+                if (mft_zone_size > 0)
+                        vol->mft_zone_end = vol->mft_zone_start + mft_zone_size;
+                else /* mft zone and data2 zone no longer exist. */
+                        vol->data2_zone_pos = vol->mft_zone_start =
+                                        vol->mft_zone_end = 0;
+                if (vol->mft_zone_pos >= vol->mft_zone_end) {
+                        vol->mft_zone_pos = vol->mft_lcn;
+                        if (!vol->mft_zone_end)
+                                vol->mft_zone_pos = 0;
+                }
+                bmp_pos = zone_start = bmp_initial_pos =
+                                vol->data1_zone_pos = vol->mft_zone_end;
+                search_zone = 2;
+                pass = 2;
+                done_zones &= ~2;
+                ntfs_debug("After shrinking mft zone, mft_zone_size 0x%llx, "
+                                "vol->mft_zone_start 0x%llx, "
+                                "vol->mft_zone_end 0x%llx, "
+                                "vol->mft_zone_pos 0x%llx, search_zone 2, "
+                                "pass 2, dones_zones 0x%x, zone_start 0x%llx, "
+                                "zone_end 0x%llx, vol->data1_zone_pos 0x%llx, "
+                                "continuing outer while loop.",
+                                (unsigned long long)mft_zone_size,
+                                (unsigned long long)vol->mft_zone_start,
+                                (unsigned long long)vol->mft_zone_end,
+                                (unsigned long long)vol->mft_zone_pos,
+                                done_zones, (unsigned long long)zone_start,
+                                (unsigned long long)zone_end,
+                                (unsigned long long)vol->data1_zone_pos);
+        }
+        ntfs_debug("After outer while loop.");
+out:
+        ntfs_debug("At out.");
+        /* Add runlist terminator element. */
+        if (likely(rl)) {
+                rl[rlpos].vcn = rl[rlpos - 1].vcn + rl[rlpos - 1].length;
+                rl[rlpos].lcn = LCN_RL_NOT_MAPPED;
+                rl[rlpos].length = 0;
+        }
+        if (likely(page && !IS_ERR(page))) {
+                if (need_writeback) {
+                        ntfs_debug("Marking page dirty.");
+                        flush_dcache_page(page);
+                        set_page_dirty(page);
+                        need_writeback = 0;
+                }
+                ntfs_unmap_page(page);
+        }
+        if (likely(!err)) {
+                up_write(&vol->lcnbmp_lock);
+                ntfs_debug("Done.");
+                return rl;
+        }
+        ntfs_error(vol->sb, "Failed to allocate clusters, aborting "
+                        "(error %i).", err);
+        if (rl) {
+                int err2;
+                if (err == ENOSPC)
+                        ntfs_debug("Not enough space to complete allocation, "
+                                        "err ENOSPC, first free lcn 0x%llx, "
+                                        "could allocate up to 0x%llx "
+                                        "clusters.",
+                                        (unsigned long long)rl[0].lcn,
+                                        (unsigned long long)count - clusters);
+                /* Deallocate all allocated clusters. */
+                ntfs_debug("Attempting rollback...");
+                err2 = ntfs_cluster_free_from_rl_nolock(vol, rl);
+                if (err2) {
+                        ntfs_error(vol->sb, "Failed to rollback (error %i).  "
+                                        "Leaving inconsistent metadata!  "
+                                        "Unmount and run chkdsk.", err2);
+                        NVolSetErrors(vol);
+                }
+                /* Free the runlist. */
+                ntfs_free(rl);
+        } else if (err == ENOSPC)
+                ntfs_debug("No space left at all, err = ENOSPC, "
+                                "first free lcn = 0x%llx.",
+                                (unsigned long long)vol->data1_zone_pos);
+        up_write(&vol->lcnbmp_lock);
+        return ERR_PTR(err);
+}
+/**
+ * __ntfs_cluster_free - free clusters on an ntfs volume
+ * @vi:         vfs inode whose runlist describes the clusters to free
+ * @start_vcn:  vcn in the runlist of @vi at which to start freeing clusters
+ * @count:      number of clusters to free or -1 for all clusters
+ * @is_rollback:        if TRUE this is a rollback operation
+ *
+ * Free @count clusters starting at the cluster @start_vcn in the runlist
+ * described by the vfs inode @vi.
+ *
+ * If @count is -1, all clusters from @start_vcn to the end of the runlist are
+ * deallocated.  Thus, to completely free all clusters in a runlist, use
+ * @start_vcn = 0 and @count = -1.
+ *
+ * @is_rollback should always be FALSE, it is for internal use to rollback
+ * errors.  You probably want to use ntfs_cluster_free() instead.
+ *
+ * Note, this function does not remove the freed runs from the runlist, so the
+ * caller has to deal with that later.
+ *
+ * Return the number of deallocated clusters (not counting sparse ones) on
+ * success and -errno on error.
+ *
+ * Locking: - The runlist described by @vi must be unlocked on entry and is
+ *            unlocked on return.
+ *          - This function takes the runlist lock of @vi for reading and
+ *            sometimes for writing and sometimes modifies the runlist.
+ *          - The volume lcn bitmap must be unlocked on entry and is unlocked
+ *            on return, except on rollback where the caller holds the lock.
+ *          - This function takes the volume lcn bitmap lock for writing and
+ *            modifies the bitmap contents.
+ */
+s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn, s64 count,
+                const BOOL is_rollback)
+{
+        s64 delta, to_free, total_freed, real_freed;
+        ntfs_inode *ni;
+        ntfs_volume *vol;
+        struct inode *lcnbmp_vi;
+        runlist_element *rl;
+        int err;
+        BUG_ON(!vi);
+        ntfs_debug("Entering for i_ino 0x%lx, start_vcn 0x%llx, count "
+                        "0x%llx.%s", vi->i_ino, (unsigned long long)start_vcn,
+                        (unsigned long long)count,
+                        is_rollback ? " (rollback)" : "");
+        ni = NTFS_I(vi);
+        vol = ni->vol;
+        lcnbmp_vi = vol->lcnbmp_ino;
+        BUG_ON(!lcnbmp_vi);
+        BUG_ON(start_vcn < 0);
+        BUG_ON(count < -1);
+        /*
+         * Lock the lcn bitmap for writing but only if not rolling back.  We
+         * must hold the lock all the way including through rollback otherwise
+         * rollback is not possible because once we have cleared a bit and
+         * dropped the lock, anyone could have set the bit again, thus
+         * allocating the cluster for another use.
+         */
+        if (likely(!is_rollback))
+                down_write(&vol->lcnbmp_lock);
+        total_freed = real_freed = 0;
+        /* This returns with ni->runlist locked for reading on success. */
+        rl = ntfs_find_vcn(ni, start_vcn, FALSE);
+        if (IS_ERR(rl)) {
+                if (!is_rollback)
+                        ntfs_error(vol->sb, "Failed to find first runlist "
+                                        "element (error %li), aborting.",
+                                        PTR_ERR(rl));
+                err = PTR_ERR(rl);
+                goto err_out;
+        }
+        if (unlikely(rl->lcn < LCN_HOLE)) {
+                if (!is_rollback)
+                        ntfs_error(vol->sb, "First runlist element has "
+                                        "invalid lcn, aborting.");
+                err = -EIO;
+                goto unl_err_out;
+        }
+        /* Find the starting cluster inside the run that needs freeing. */
+        delta = start_vcn - rl->vcn;
+        /* The number of clusters in this run that need freeing. */
+        to_free = rl->length - delta;
+        if (count >= 0 && to_free > count)
+                to_free = count;
+        if (likely(rl->lcn >= 0)) {
+                /* Do the actual freeing of the clusters in this run. */
+                err = ntfs_bitmap_set_bits_in_run(lcnbmp_vi, rl->lcn + delta,
+                                to_free, likely(!is_rollback) ? 0 : 1);
+                if (unlikely(err)) {
+                        if (!is_rollback)
+                                ntfs_error(vol->sb, "Failed to clear first run "
+                                                "(error %i), aborting.", err);
+                        goto unl_err_out;
+                }
+                /* We have freed @to_free real clusters. */
+                real_freed = to_free;
+        }
+        /* Go to the next run and adjust the number of clusters left to free. */
+        ++rl;
+        if (count >= 0)
+                count -= to_free;
+        /* Keep track of the total "freed" clusters, including sparse ones. */
+        total_freed = to_free;
+        /*
+         * Loop over the remaining runs, using @count as a capping value, and
+         * free them.
+         */
+        for (; rl->length && count != 0; ++rl) {
+                if (unlikely(rl->lcn < LCN_HOLE)) {
+                        VCN vcn;
+                        /*
+                         * Attempt to map runlist, dropping runlist lock for
+                         * the duration.
+                         */
+                        vcn = rl->vcn;
+                        up_read(&ni->runlist.lock);
+                        err = ntfs_map_runlist(ni, vcn);
+                        if (err) {
+                                if (!is_rollback)
+                                        ntfs_error(vol->sb, "Failed to map "
+                                                        "runlist fragment.");
+                                if (err == -EINVAL || err == -ENOENT)
+                                        err = -EIO;
+                                goto err_out;
+                        }
+                        /*
+                         * This returns with ni->runlist locked for reading on
+                         * success.
+                         */
+                        rl = ntfs_find_vcn(ni, vcn, FALSE);
+                        if (IS_ERR(rl)) {
+                                err = PTR_ERR(rl);
+                                if (!is_rollback)
+                                        ntfs_error(vol->sb, "Failed to find "
+                                                        "subsequent runlist "
+                                                        "element.");
+                                goto err_out;
+                        }
+                        if (unlikely(rl->lcn < LCN_HOLE)) {
+                                if (!is_rollback)
+                                        ntfs_error(vol->sb, "Runlist element "
+                                                        "has invalid lcn "
+                                                        "(0x%llx).",
+                                                        (unsigned long long)
+                                                        rl->lcn);
+                                err = -EIO;
+                                goto unl_err_out;
+                        }
+                }
+                /* The number of clusters in this run that need freeing. */
+                to_free = rl->length;
+                if (count >= 0 && to_free > count)
+                        to_free = count;
+                if (likely(rl->lcn >= 0)) {
+                        /* Do the actual freeing of the clusters in the run. */
+                        err = ntfs_bitmap_set_bits_in_run(lcnbmp_vi, rl->lcn,
+                                        to_free, likely(!is_rollback) ? 0 : 1);
+                        if (unlikely(err)) {
+                                if (!is_rollback)
+                                        ntfs_error(vol->sb, "Failed to clear "
+                                                        "subsequent run.");
+                                goto unl_err_out;
+                        }
+                        /* We have freed @to_free real clusters. */
+                        real_freed += to_free;
+                }
+                /* Adjust the number of clusters left to free. */
+                if (count >= 0)
+                        count -= to_free;
+
+                /* Update the total done clusters. */
+                total_freed += to_free;
+        }
+        up_read(&ni->runlist.lock);
+        if (likely(!is_rollback))
+                up_write(&vol->lcnbmp_lock);
+        BUG_ON(count > 0);
+        /* We are done.  Return the number of actually freed clusters. */
+        ntfs_debug("Done.");
+        return real_freed;
+unl_err_out:
+        up_read(&ni->runlist.lock);
+err_out:
+        if (is_rollback)
+                return err;
+        /* If no real clusters were freed, no need to rollback. */
+        if (!real_freed) {
+                up_write(&vol->lcnbmp_lock);
+                return err;
+        }
+        /*
+         * Attempt to rollback and if that succeeds just return the error code.
+         * If rollback fails, set the volume errors flag, emit an error
+         * message, and return the error code.
+         */
+        delta = __ntfs_cluster_free(vi, start_vcn, total_freed, TRUE);
+        if (delta < 0) {
+                ntfs_error(vol->sb, "Failed to rollback (error %i).  Leaving "
+                                "inconsistent metadata!  Unmount and run "
+                                "chkdsk.", (int)delta);
+                NVolSetErrors(vol);
+        }
+        up_write(&vol->lcnbmp_lock);
+        ntfs_error(vol->sb, "Aborting (error %i).", err);
+        return err;
+}
+#endif /* NTFS_RW */
diff --git a/fs/ntfs/lcnalloc.h b/fs/ntfs/lcnalloc.h
new file mode 100644
index 000000000000..4cac1c024af6
--- /dev/null
+++ b/fs/ntfs/lcnalloc.h
@@ -0,0 +1,112 @@
+/*
+ * lcnalloc.h - Exports for NTFS kernel cluster (de)allocation.  Part of the
+ *              Linux-NTFS project.
+ *
+ * Copyright (c) 2004 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#ifndef _LINUX_NTFS_LCNALLOC_H
+#define _LINUX_NTFS_LCNALLOC_H
+#ifdef NTFS_RW
+#include <linux/fs.h>
+#include "types.h"
+#include "runlist.h"
+#include "volume.h"
+typedef enum {
+        MFT_ZONE        = 0,            /* Allocate from the $MFT zone. */
+        DATA_ZONE       = 1,            /* Allocate from the $DATA zone. */
+        FIRST_ZONE      = MFT_ZONE,     /* Lower bound for sanity checks. */
+        LAST_ZONE       = DATA_ZONE,    /* Upper bound for sanity checks. */
+} NTFS_CLUSTER_ALLOCATION_ZONES;
+extern runlist_element *ntfs_cluster_alloc(ntfs_volume *vol,
+                const VCN start_vcn, const s64 count, const LCN start_lcn,
+                const NTFS_CLUSTER_ALLOCATION_ZONES zone);
+extern s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn,
+                s64 count, const BOOL is_rollback);
+/**
+ * ntfs_cluster_free - free clusters on an ntfs volume
+ * @vi:         vfs inode whose runlist describes the clusters to free
+ * @start_vcn:  vcn in the runlist of @vi at which to start freeing clusters
+ * @count:      number of clusters to free or -1 for all clusters
+ *
+ * Wrapper calling __ntfs_cluster_free() with is_rollback FALSE: free @count
+ * clusters starting at @start_vcn in the runlist described by vfs inode @vi.
+ *
+ * A @count of -1 means everything from @start_vcn to the end of the runlist is
+ * deallocated, so @start_vcn = 0 together with @count = -1 frees all clusters
+ * in the runlist.
+ *
+ * The freed runs are not removed from the runlist, thus the caller has to
+ * deal with that later.
+ *
+ * On success the number of deallocated clusters (sparse ones are not counted)
+ * is returned and on error -errno.
+ *
+ * Locking: - On entry the runlist described by @vi must be unlocked and it is
+ *            unlocked again on return.
+ *          - The runlist lock of @vi is taken for reading and sometimes for
+ *            writing and the runlist is sometimes modified.
+ *          - On entry the volume lcn bitmap must be unlocked and it is
+ *            unlocked again on return.
+ *          - The volume lcn bitmap lock is taken for writing and the bitmap
+ *            contents are modified.
+ */
+static inline s64 ntfs_cluster_free(struct inode *vi,
+                const VCN start_vcn, s64 count)
+{
+        return __ntfs_cluster_free(vi, start_vcn, count, FALSE);
+}
+extern int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol,
+                const runlist_element *rl);
+/**
+ * ntfs_cluster_free_from_rl - free clusters from runlist
+ * @vol:        mounted ntfs volume on which to free the clusters
+ * @rl:         runlist describing the clusters to free
+ *
+ * Locked wrapper around ntfs_cluster_free_from_rl_nolock(): all the clusters
+ * described by the runlist @rl are freed on the volume @vol.  If an error is
+ * returned, at least some of the clusters were not freed.
+ *
+ * Return 0 on success and -errno on error.
+ *
+ * Locking: The volume lcn bitmap lock is taken for writing around the call and
+ *          the bitmap contents are modified.
+ */
+static inline int ntfs_cluster_free_from_rl(ntfs_volume *vol,
+                const runlist_element *rl)
+{
+        int err;
+        down_write(&vol->lcnbmp_lock);
+        err = ntfs_cluster_free_from_rl_nolock(vol, rl);
+        up_write(&vol->lcnbmp_lock);
+        return err;
+}
+#endif /* NTFS_RW */
+#endif /* defined _LINUX_NTFS_LCNALLOC_H */
diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c
new file mode 100644
index 000000000000..5e280abafab3
--- /dev/null
+++ b/fs/ntfs/logfile.c
@@ -0,0 +1,705 @@
+/*
+ * logfile.c - NTFS kernel journal handling. Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2002-2004 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#ifdef NTFS_RW
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/highmem.h>
+#include <linux/buffer_head.h>
+#include <linux/bitops.h>
+#include "attrib.h"
+#include "aops.h"
+#include "debug.h"
+#include "logfile.h"
+#include "malloc.h"
+#include "volume.h"
+#include "ntfs.h"
+/**
+ * ntfs_check_restart_page_header - check the page header for consistency
+ * @vi:         $LogFile inode to which the restart page header belongs
+ * @rp:         restart page header to check
+ * @pos:        position in @vi at which the restart page header resides
+ *
+ * Check the restart page header @rp for consistency and return TRUE if it is
+ * consistent and FALSE otherwise.  Offsets are computed in 32 bits so that
+ * bogus on-disk values cannot wrap around and slip past the range checks.
+ * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not
+ * require the full restart page.
+ */
+static BOOL ntfs_check_restart_page_header(struct inode *vi,
+                RESTART_PAGE_HEADER *rp, s64 pos)
+{
+        u32 logfile_system_page_size, logfile_log_page_size;
+        u32 usa_count, usa_ofs, usa_end, ra_ofs;
+        ntfs_debug("Entering.");
+        /*
+         * If the system or log page sizes are smaller than the ntfs block size
+         * or either is not a power of 2 we cannot handle this log file.
+         */
+        logfile_system_page_size = le32_to_cpu(rp->system_page_size);
+        logfile_log_page_size = le32_to_cpu(rp->log_page_size);
+        if (logfile_system_page_size < NTFS_BLOCK_SIZE ||
+                        logfile_log_page_size < NTFS_BLOCK_SIZE ||
+                        logfile_system_page_size &
+                        (logfile_system_page_size - 1) ||
+                        logfile_log_page_size & (logfile_log_page_size - 1)) {
+                ntfs_error(vi->i_sb, "$LogFile uses unsupported page size.");
+                return FALSE;
+        }
+        /*
+         * We must be either at !pos (1st restart page) or at pos = system page
+         * size (2nd restart page).
+         */
+        if (pos && pos != logfile_system_page_size) {
+                ntfs_error(vi->i_sb, "Found restart area in incorrect "
+                                "position in $LogFile.");
+                return FALSE;
+        }
+        /* We only know how to handle version 1.1. */
+        if (sle16_to_cpu(rp->major_ver) != 1 ||
+                        sle16_to_cpu(rp->minor_ver) != 1) {
+                ntfs_error(vi->i_sb, "$LogFile version %i.%i is not "
+                                "supported.  (This driver supports version "
+                                "1.1 only.)", (int)sle16_to_cpu(rp->major_ver),
+                                (int)sle16_to_cpu(rp->minor_ver));
+                return FALSE;
+        }
+        /* Verify the size of the update sequence array (untruncated u32). */
+        usa_count = 1 + (logfile_system_page_size >> NTFS_BLOCK_SIZE_BITS);
+        if (usa_count != le16_to_cpu(rp->usa_count)) {
+                ntfs_error(vi->i_sb, "$LogFile restart page specifies "
+                                "inconsistent update sequence array count.");
+                return FALSE;
+        }
+        /* Verify the position of the update sequence array (no u16 wrap). */
+        usa_ofs = le16_to_cpu(rp->usa_ofs);
+        usa_end = usa_ofs + usa_count * sizeof(u16);
+        if (usa_ofs < sizeof(RESTART_PAGE_HEADER) ||
+                        usa_end > NTFS_BLOCK_SIZE - sizeof(u16)) {
+                ntfs_error(vi->i_sb, "$LogFile restart page specifies "
+                                "inconsistent update sequence array offset.");
+                return FALSE;
+        }
+        /*
+         * Verify the position of the restart area.  It must be:
+         *      - aligned to 8-byte boundary,
+         *      - after the update sequence array, and
+         *      - within the system page size.
+         */
+        ra_ofs = le16_to_cpu(rp->restart_area_offset);
+        if (ra_ofs & 7 || ra_ofs < usa_end ||
+                        ra_ofs > logfile_system_page_size) {
+                ntfs_error(vi->i_sb, "$LogFile restart page specifies "
+                                "inconsistent restart area offset.");
+                return FALSE;
+        }
+        /*
+         * Only restart pages modified by chkdsk are allowed to have chkdsk_lsn
+         * set.
+         */
+        if (!ntfs_is_chkd_record(rp->magic) && sle64_to_cpu(rp->chkdsk_lsn)) {
+                ntfs_error(vi->i_sb, "$LogFile restart page is not modified "
+                                "by chkdsk but a chkdsk LSN is specified.");
+                return FALSE;
+        }
+        ntfs_debug("Done.");
+        return TRUE;
+}
+/**
+ * ntfs_check_restart_area - check the restart area for consistency
+ * @vi:         $LogFile inode to which the restart page belongs
+ * @rp:         restart page whose restart area to check
+ *
+ * Check the restart area of the restart page @rp for consistency and return
+ * TRUE if it is consistent and FALSE otherwise.
+ *
+ * This function assumes that the restart page header has already been
+ * consistency checked.
+ *
+ * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not
+ * require the full restart page.
+ */
+static BOOL ntfs_check_restart_area(struct inode *vi, RESTART_PAGE_HEADER *rp)
+{
+        u64 file_size;
+        RESTART_AREA *ra;
+        u32 ra_len;
+        u16 ra_ofs, ca_ofs;
+        u8 fs_bits;
+        ntfs_debug("Entering.");
+        ra_ofs = le16_to_cpu(rp->restart_area_offset);
+        ra = (RESTART_AREA*)((u8*)rp + ra_ofs);
+        /*
+         * Everything before ra->file_size must be before the first word
+         * protected by an update sequence number.  This ensures that it is
+         * safe to access ra->client_array_offset.
+         */
+        if (ra_ofs + offsetof(RESTART_AREA, file_size) >
+                        NTFS_BLOCK_SIZE - sizeof(u16)) {
+                ntfs_error(vi->i_sb, "$LogFile restart area specifies "
+                                "inconsistent file offset.");
+                return FALSE;
+        }
+        /*
+         * Now that we can access ra->client_array_offset, make sure everything
+         * up to the log client array is before the first word protected by an
+         * update sequence number.  This ensures we can access all of the
+         * restart area elements safely.  Also, the client array offset must be
+         * aligned to an 8-byte boundary.
+         */
+        ca_ofs = le16_to_cpu(ra->client_array_offset);
+        if (((ca_ofs + 7) & ~7) != ca_ofs ||
+                        ra_ofs + ca_ofs > NTFS_BLOCK_SIZE - sizeof(u16)) {
+                ntfs_error(vi->i_sb, "$LogFile restart area specifies "
+                                "inconsistent client array offset.");
+                return FALSE;
+        }
+        /*
+         * The restart area must end within the system page size both when
+         * calculated manually and as specified by ra->restart_area_length.
+         * Also, the calculated length must not exceed the specified length.
+         * The calculated length is held in 32 bits because ra->log_clients
+         * comes from disk and the product below can exceed a 16-bit value.
+         */
+        ra_len = ca_ofs + le16_to_cpu(ra->log_clients) *
+                        sizeof(LOG_CLIENT_RECORD);
+        if (ra_ofs + ra_len > le32_to_cpu(rp->system_page_size) ||
+                        ra_ofs + le16_to_cpu(ra->restart_area_length) >
+                        le32_to_cpu(rp->system_page_size) ||
+                        ra_len > le16_to_cpu(ra->restart_area_length)) {
+                ntfs_error(vi->i_sb, "$LogFile restart area is out of bounds "
+                                "of the system page size specified by the "
+                                "restart page header and/or the specified "
+                                "restart area length is inconsistent.");
+                return FALSE;
+        }
+        /*
+         * The ra->client_free_list and ra->client_in_use_list must be either
+         * LOGFILE_NO_CLIENT or less than ra->log_clients or they are
+         * overflowing the client array.
+         */
+        if ((ra->client_free_list != LOGFILE_NO_CLIENT &&
+                        le16_to_cpu(ra->client_free_list) >=
+                        le16_to_cpu(ra->log_clients)) ||
+                        (ra->client_in_use_list != LOGFILE_NO_CLIENT &&
+                        le16_to_cpu(ra->client_in_use_list) >=
+                        le16_to_cpu(ra->log_clients))) {
+                ntfs_error(vi->i_sb, "$LogFile restart area specifies "
+                                "overflowing client free and/or in use lists.");
+                return FALSE;
+        }
+        /*
+         * Check ra->seq_number_bits against ra->file_size for consistency.
+         * We cannot just use ffs() because the file size is not a power of 2.
+         */
+        file_size = (u64)sle64_to_cpu(ra->file_size);
+        for (fs_bits = 0; file_size; file_size >>= 1)
+                fs_bits++;
+        if (le32_to_cpu(ra->seq_number_bits) != 67 - fs_bits) {
+                ntfs_error(vi->i_sb, "$LogFile restart area specifies "
+                                "inconsistent sequence number bits.");
+                return FALSE;
+        }
+        /* The log record header length must be a multiple of 8. */
+        if (((le16_to_cpu(ra->log_record_header_length) + 7) & ~7) !=
+                        le16_to_cpu(ra->log_record_header_length)) {
+                ntfs_error(vi->i_sb, "$LogFile restart area specifies "
+                                "inconsistent log record header length.");
+                return FALSE;
+        }
+        /* Ditto for the log page data offset. */
+        if (((le16_to_cpu(ra->log_page_data_offset) + 7) & ~7) !=
+                        le16_to_cpu(ra->log_page_data_offset)) {
+                ntfs_error(vi->i_sb, "$LogFile restart area specifies "
+                                "inconsistent log page data offset.");
+                return FALSE;
+        }
+        ntfs_debug("Done.");
+        return TRUE;
+}
+/**
+ * ntfs_check_log_client_array - check the log client array for consistency
+ * @vi:         $LogFile inode to which the restart page belongs
+ * @rp:         restart page whose log client array to check
+ *
+ * Walk the free list and then the in use list of log client records in the
+ * restart page @rp and return TRUE if both are consistent, FALSE otherwise.
+ *
+ * The restart page header and the restart area must already have been
+ * consistency checked by the caller.
+ *
+ * In contrast to ntfs_check_restart_page_header() and
+ * ntfs_check_restart_area(), @rp->system_page_size bytes must be present in
+ * @rp and the page must already be multi sector transfer deprotected.
+ */
+static BOOL ntfs_check_log_client_array(struct inode *vi,
+                RESTART_PAGE_HEADER *rp)
+{
+        RESTART_AREA *ra;
+        LOG_CLIENT_RECORD *ca, *cr;
+        u16 max_clients, budget, idx;
+        int list;
+        BOOL at_head;
+        ntfs_debug("Entering.");
+        ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset));
+        ca = (LOG_CLIENT_RECORD*)((u8*)ra +
+                        le16_to_cpu(ra->client_array_offset));
+        /*
+         * Both lists together cannot hold more than ra->log_clients records,
+         * so @budget is shared between them and counts down as records are
+         * visited.  Running out of budget before reaching the end of a list
+         * means that a list loops back on itself.
+         */
+        max_clients = le16_to_cpu(ra->log_clients);
+        budget = max_clients;
+        /* Pass 0 walks the free list, pass 1 walks the in use list. */
+        for (list = 0; list < 2; list++) {
+                idx = le16_to_cpu(list ? ra->client_in_use_list :
+                                ra->client_free_list);
+                at_head = TRUE;
+                while (idx != LOGFILE_NO_CLIENT_CPU) {
+                        /* Loop detected or index outside of the array. */
+                        if (!budget || idx >= max_clients)
+                                goto err_out;
+                        /* Set @cr to the current log client record. */
+                        cr = ca + idx;
+                        /* The head of a list must not have a prev_client. */
+                        if (at_head) {
+                                if (cr->prev_client != LOGFILE_NO_CLIENT)
+                                        goto err_out;
+                                at_head = FALSE;
+                        }
+                        /* Account for this record and move to the next one. */
+                        budget--;
+                        idx = le16_to_cpu(cr->next_client);
+                }
+        }
+        ntfs_debug("Done.");
+        return TRUE;
+err_out:
+        ntfs_error(vi->i_sb, "$LogFile log client array is corrupt.");
+        return FALSE;
+}
+/**
+ * ntfs_check_and_load_restart_page - check the restart page for consistency
+ * @vi:         $LogFile inode to which the restart page belongs
+ * @rp:         restart page to check
+ * @pos:        position in @vi at which the restart page resides
+ * @wrp:        copy of the multi sector transfer deprotected restart page
+ *
+ * Check the restart page @rp for consistency and return TRUE if it is
+ * consistent and FALSE otherwise.
+ *
+ * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not
+ * require the full restart page.
+ *
+ * If @wrp is not NULL, on success, *@wrp will point to a buffer containing a
+ * copy of the complete multi sector transfer deprotected page.  The caller
+ * has to release it with ntfs_free().  On failure, *@wrp is undefined.
+ */
+static BOOL ntfs_check_and_load_restart_page(struct inode *vi,
+                RESTART_PAGE_HEADER *rp, s64 pos, RESTART_PAGE_HEADER **wrp)
+{
+        RESTART_PAGE_HEADER *trp;
+        u32 rp_size;
+        int size;
+        BOOL ret;
+        ntfs_debug("Entering.");
+        /* Check the restart page header for consistency. */
+        if (!ntfs_check_restart_page_header(vi, rp, pos)) {
+                /* Error output already done inside the function. */
+                return FALSE;
+        }
+        /* Check the restart area for consistency. */
+        if (!ntfs_check_restart_area(vi, rp)) {
+                /* Error output already done inside the function. */
+                return FALSE;
+        }
+        rp_size = le32_to_cpu(rp->system_page_size);
+        /*
+         * Allocate a buffer to store the whole restart page so we can multi
+         * sector transfer deprotect it.
+         */
+        trp = ntfs_malloc_nofs(rp_size);
+        if (!trp) {
+                ntfs_error(vi->i_sb, "Failed to allocate memory for $LogFile "
+                                "restart page buffer.");
+                return FALSE;
+        }
+        /*
+         * Read the whole of the restart page into the buffer.  If it fits
+         * completely inside @rp, just copy it from there.  Otherwise map all
+         * the required pages and copy the data from them.
+         */
+        size = PAGE_CACHE_SIZE - (pos & ~PAGE_CACHE_MASK);
+        if (size >= rp_size) {
+                memcpy(trp, rp, rp_size);
+        } else {
+                pgoff_t idx;
+                struct page *page;
+                int have_read, to_read;
+                /* First copy what we already have in @rp. */
+                memcpy(trp, rp, size);
+                /* Copy the remaining data one page at a time. */
+                have_read = size;
+                to_read = rp_size - size;
+                idx = (pos + size) >> PAGE_CACHE_SHIFT;
+                BUG_ON((pos + size) & ~PAGE_CACHE_MASK);
+                do {
+                        page = ntfs_map_page(vi->i_mapping, idx);
+                        if (IS_ERR(page)) {
+                                ntfs_error(vi->i_sb, "Error mapping $LogFile "
+                                                "page (index %lu).", idx);
+                                goto err_out;
+                        }
+                        size = min_t(int, to_read, PAGE_CACHE_SIZE);
+                        memcpy((u8*)trp + have_read, page_address(page), size);
+                        ntfs_unmap_page(page);
+                        have_read += size;
+                        to_read -= size;
+                        idx++;
+                } while (to_read > 0);
+        }
+        /* Perform the multi sector transfer deprotection on the buffer. */
+        if (post_read_mst_fixup((NTFS_RECORD*)trp, rp_size)) {
+                ntfs_error(vi->i_sb, "Multi sector transfer error detected in "
+                                "$LogFile restart page.");
+                /* The buffer is of no use to anyone, release it below. */
+                goto err_out;
+        }
+        /* Check the log client records for consistency. */
+        ret = ntfs_check_log_client_array(vi, trp);
+        if (ret && wrp)
+                *wrp = trp;
+        else
+                ntfs_free(trp);
+        ntfs_debug("Done.");
+        return ret;
+err_out:
+        ntfs_free(trp);
+        return FALSE;
+}
+/**
+ * ntfs_check_logfile - check in the journal if the volume is consistent
+ * @log_vi:     struct inode of loaded journal $LogFile to check
+ *
+ * Check the $LogFile journal for consistency and return TRUE if it is
+ * consistent and FALSE if not.
+ *
+ * At present we only check the two restart pages and ignore the log record
+ * pages.
+ *
+ * Note that the MstProtected flag is not set on the $LogFile inode and hence
+ * when reading pages they are not deprotected.  This is because we do not know
+ * if the $LogFile was created on a system with a different page size to ours
+ * yet and mst deprotection would fail if our page size is smaller.
+ */
+BOOL ntfs_check_logfile(struct inode *log_vi)
+{
+        s64 size, pos;
+        ntfs_volume *vol = NTFS_SB(log_vi->i_sb);
+        struct address_space *mapping = log_vi->i_mapping;
+        struct page *page = NULL;
+        u8 *kaddr = NULL;
+        RESTART_PAGE_HEADER *rstr1_ph = NULL;
+        RESTART_PAGE_HEADER *rstr2_ph = NULL;
+        int log_page_size, log_page_mask, ofs;
+        BOOL logfile_is_empty = TRUE;
+        BOOL rstr1_found = FALSE;
+        BOOL rstr2_found = FALSE;
+        u8 log_page_bits;
+        ntfs_debug("Entering.");
+        /* An empty $LogFile must have been clean before it got emptied. */
+        if (NVolLogFileEmpty(vol))
+                goto is_empty;
+        size = log_vi->i_size;
+        /* Make sure the file doesn't exceed the maximum allowed size. */
+        if (size > MaxLogFileSize)
+                size = MaxLogFileSize;
+        /*
+         * Truncate size to a multiple of the page cache size or the default
+         * log page size if the page cache size is between the default log page
+         * size and twice that.
+         */
+        if (PAGE_CACHE_SIZE >= DefaultLogPageSize && PAGE_CACHE_SIZE <=
+                        DefaultLogPageSize * 2)
+                log_page_size = DefaultLogPageSize;
+        else
+                log_page_size = PAGE_CACHE_SIZE;
+        log_page_mask = log_page_size - 1;
+        /*
+         * Use generic_ffs() instead of ffs() to enable the compiler to
+         * optimize log_page_size and log_page_bits into constants.
+         */
+        log_page_bits = generic_ffs(log_page_size) - 1;
+        size &= ~log_page_mask;
+        /*
+         * Ensure the log file is big enough to store at least the two restart
+         * pages and the minimum number of log record pages.
+         */
+        if (size < log_page_size * 2 || (size - log_page_size * 2) >>
+                        log_page_bits < MinLogRecordPages) {
+                ntfs_error(vol->sb, "$LogFile is too small.");
+                return FALSE;
+        }
+        /*
+         * Read through the file looking for a restart page.  Since the restart
+         * page header is at the beginning of a page we only need to search at
+         * what could be the beginning of a page (for each page size) rather
+         * than scanning the whole file byte by byte.  If all potential places
+         * contain empty and uninitialized records, the log file can be assumed
+         * to be empty.
+         */
+        for (pos = 0; pos < size; pos <<= 1) {
+                pgoff_t idx = pos >> PAGE_CACHE_SHIFT;
+                if (!page || page->index != idx) {
+                        if (page)
+                                ntfs_unmap_page(page);
+                        page = ntfs_map_page(mapping, idx);
+                        if (IS_ERR(page)) {
+                                ntfs_error(vol->sb, "Error mapping $LogFile "
+                                                "page (index %lu).", idx);
+                                /* Error pointer, it must not be unmapped. */
+                                page = NULL;
+                                goto err_out;
+                        }
+                }
+                kaddr = (u8*)page_address(page) + (pos & ~PAGE_CACHE_MASK);
+                /*
+                 * A non-empty block means the logfile is not empty while an
+                 * empty block after a non-empty block has been encountered
+                 * means we are done.
+                 */
+                if (!ntfs_is_empty_recordp((le32*)kaddr))
+                        logfile_is_empty = FALSE;
+                else if (!logfile_is_empty)
+                        break;
+                /*
+                 * A log record page means there cannot be a restart page after
+                 * this so no need to continue searching.
+                 */
+                if (ntfs_is_rcrd_recordp((le32*)kaddr))
+                        break;
+                /*
+                 * A modified by chkdsk restart page means we cannot handle
+                 * this log file.
+                 */
+                if (ntfs_is_chkd_recordp((le32*)kaddr)) {
+                        ntfs_error(vol->sb, "$LogFile has been modified by "
+                                        "chkdsk.  Mount this volume in "
+                                        "Windows.");
+                        goto err_out;
+                }
+                /* If not a restart page, continue. */
+                if (!ntfs_is_rstr_recordp((le32*)kaddr)) {
+                        /* Skip to the minimum page size for the next one. */
+                        if (!pos)
+                                pos = NTFS_BLOCK_SIZE >> 1;
+                        continue;
+                }
+                /* We now know we have a restart page. */
+                if (!pos) {
+                        rstr1_found = TRUE;
+                } else {
+                        if (rstr2_found) {
+                                ntfs_error(vol->sb, "Found more than two "
+                                                "restart pages in $LogFile.");
+                                goto err_out;
+                        }
+                        rstr2_found = TRUE;
+                }
+                /*
+                 * Check the restart page for consistency and get a copy of the
+                 * complete multi sector transfer deprotected restart page.
+                 */
+                if (!ntfs_check_and_load_restart_page(log_vi,
+                                (RESTART_PAGE_HEADER*)kaddr, pos,
+                                !pos ? &rstr1_ph : &rstr2_ph)) {
+                        /* Error output already done inside the function. */
+                        goto err_out;
+                }
+                /*
+                 * We have a valid restart page.  The next one must be after
+                 * a whole system page size as specified by the valid restart
+                 * page.
+                 */
+                if (!pos)
+                        pos = le32_to_cpu(rstr1_ph->system_page_size) >> 1;
+        }
+        if (page) {
+                ntfs_unmap_page(page);
+                page = NULL;
+        }
+        if (logfile_is_empty) {
+                NVolSetLogFileEmpty(vol);
+is_empty:
+                ntfs_debug("Done.  ($LogFile is empty.)");
+                return TRUE;
+        }
+        if (!rstr1_found || !rstr2_found) {
+                ntfs_error(vol->sb, "Did not find two restart pages in "
+                                "$LogFile.");
+                goto err_out;
+        }
+        /*
+         * The two restart areas must be identical except for the update
+         * sequence number.
+         */
+        ofs = le16_to_cpu(rstr1_ph->usa_ofs);
+        if (memcmp(rstr1_ph, rstr2_ph, ofs) || (ofs += sizeof(u16),
+                        memcmp((u8*)rstr1_ph + ofs, (u8*)rstr2_ph + ofs,
+                        le32_to_cpu(rstr1_ph->system_page_size) - ofs))) {
+                ntfs_error(vol->sb, "The two restart pages in $LogFile do not "
+                                "match.");
+                goto err_out;
+        }
+        /* The restart page copies are no longer needed. */
+        ntfs_free(rstr1_ph);
+        ntfs_free(rstr2_ph);
+        /* All consistency checks passed. */
+        ntfs_debug("Done.");
+        return TRUE;
+err_out:
+        if (page)
+                ntfs_unmap_page(page);
+        if (rstr1_ph)
+                ntfs_free(rstr1_ph);
+        if (rstr2_ph)
+                ntfs_free(rstr2_ph);
+        return FALSE;
+}
+/**
+ * ntfs_is_logfile_clean - check in the journal if the volume is clean
+ * @log_vi:     struct inode of loaded journal $LogFile to check
+ *
+ * Analyze the $LogFile journal and return TRUE if it indicates the volume was
+ * shutdown cleanly and FALSE if not.
+ *
+ * Only the first restart page is examined; the log record pages are ignored.
+ * This is a little bit crude in that there will be a very small number of
+ * cases where a volume is considered dirty when in fact it is clean.  That
+ * should only affect volumes that have not been shutdown cleanly but did not
+ * have any pending, non-check-pointed i/o, i.e. they were completely idle at
+ * least for the five seconds preceding the unclean shutdown.
+ *
+ * The $LogFile journal must already have been consistency checked by a call to
+ * ntfs_check_logfile().  In particular, if the $LogFile is empty then
+ * NVolLogFileEmpty() must be true as otherwise an empty volume will be
+ * reported as dirty.
+ */
+BOOL ntfs_is_logfile_clean(struct inode *log_vi)
+{
+        ntfs_volume *vol = NTFS_SB(log_vi->i_sb);
+        struct page *first_page;
+        RESTART_PAGE_HEADER *hdr;
+        RESTART_AREA *area;
+        BOOL is_clean = FALSE;
+        ntfs_debug("Entering.");
+        /* An empty $LogFile must have been clean before it got emptied. */
+        if (NVolLogFileEmpty(vol)) {
+                ntfs_debug("Done.  ($LogFile is empty.)");
+                return TRUE;
+        }
+        /*
+         * Map the first restart page.  It may be incomplete and it is not
+         * multi sector transfer deprotected but only the first
+         * NTFS_BLOCK_SIZE bytes are needed so that does not matter.
+         */
+        first_page = ntfs_map_page(log_vi->i_mapping, 0);
+        if (IS_ERR(first_page)) {
+                ntfs_error(vol->sb, "Error mapping $LogFile page (index 0).");
+                return FALSE;
+        }
+        hdr = (RESTART_PAGE_HEADER*)page_address(first_page);
+        if (!ntfs_is_rstr_record(hdr->magic)) {
+                ntfs_error(vol->sb, "No restart page found at offset zero in "
+                                "$LogFile.  This is probably a bug in that "
+                                "the $LogFile should have been consistency "
+                                "checked before calling this function.");
+                goto unmap;
+        }
+        area = (RESTART_AREA*)((u8*)hdr +
+                        le16_to_cpu(hdr->restart_area_offset));
+        /*
+         * The volume counts as clean when the $LogFile has no active clients,
+         * i.e. it is closed, or when the RESTART_VOLUME_IS_CLEAN bit is set
+         * in the restart area flags.  Otherwise assume an unclean shutdown.
+         */
+        if (area->client_in_use_list == LOGFILE_NO_CLIENT ||
+                        (area->flags & RESTART_VOLUME_IS_CLEAN)) {
+                ntfs_debug("Done.  $LogFile indicates a clean shutdown.");
+                is_clean = TRUE;
+        } else {
+                ntfs_debug("Done.  $LogFile indicates a dirty shutdown.");
+        }
+unmap:
+        ntfs_unmap_page(first_page);
+        return is_clean;
+}
+/**
+ * ntfs_empty_logfile - empty the contents of the $LogFile journal
+ * @log_vi:     struct inode of loaded journal $LogFile to empty
+ *
+ * Fill the $LogFile journal @log_vi with 0xff bytes, unless the volume is
+ * already flagged as having an empty $LogFile, and return TRUE on success and
+ * FALSE on error.
+ *
+ * The caller must already have consistency checked the $LogFile journal with
+ * ntfs_check_logfile() and used ntfs_is_logfile_clean() to ensure it is clean.
+ */
+BOOL ntfs_empty_logfile(struct inode *log_vi)
+{
+        ntfs_volume *vol = NTFS_SB(log_vi->i_sb);
+        int err;
+        ntfs_debug("Entering.");
+        if (NVolLogFileEmpty(vol))
+                goto done;
+        err = ntfs_attr_set(NTFS_I(log_vi), 0, log_vi->i_size, 0xff);
+        if (unlikely(err)) {
+                ntfs_error(vol->sb, "Failed to fill $LogFile with "
+                                "0xff bytes (error code %i).", err);
+                return FALSE;
+        }
+        /* Set the flag so we do not have to do it again on remount. */
+        NVolSetLogFileEmpty(vol);
+done:
+        ntfs_debug("Done.");
+        return TRUE;
+}
+#endif /* NTFS_RW */
diff --git a/fs/ntfs/logfile.h b/fs/ntfs/logfile.h
new file mode 100644
index 000000000000..4ee4378de061
--- /dev/null
+++ b/fs/ntfs/logfile.h
@@ -0,0 +1,307 @@
+/*
+ * logfile.h - Defines for NTFS kernel journal ($LogFile) handling.  Part of
+ *             the Linux-NTFS project.
+ *
+ * Copyright (c) 2000-2004 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#ifndef _LINUX_NTFS_LOGFILE_H
+#define _LINUX_NTFS_LOGFILE_H
+#ifdef NTFS_RW
+#include <linux/fs.h>
+#include "types.h"
+#include "endian.h"
+#include "layout.h"
+/*
+ * Journal ($LogFile) organization:
+ *
+ * Two restart areas present in the first two pages (restart pages, one restart
+ * area in each page).  When the volume is dismounted they should be identical,
+ * except for the update sequence array which usually has a different update
+ * sequence number.
+ *
+ * These are followed by log records organized in pages headed by a log record
+ * header going up to log file size.  Not all pages contain log records when a
+ * volume is first formatted, but as the volume ages, all records will be used.
+ * When the log file fills up, the records at the beginning are purged (by
+ * modifying the oldest_lsn to a higher value presumably) and writing begins
+ * at the beginning of the file.  Effectively, the log file is viewed as a
+ * circular entity.
+ *
+ * NOTE: Windows NT, 2000, and XP all use log file version 1.1 but they accept
+ * versions <= 1.x, including 0.-1.  (Yes, that is a minus one in there!)  We
+ * probably only want to support 1.1 as this seems to be the current version
+ * and we don't know how that differs from the older versions.  The only
+ * exception is if the journal is clean as marked by the two restart pages
+ * then it doesn't matter whether we are on an earlier version.  We can just
+ * reinitialize the logfile and start again with version 1.1.
+ */
+/* Some $LogFile related constants. */
+#define MaxLogFileSize          0x100000000ULL  /* 4GiB, in bytes. */
+#define DefaultLogPageSize      4096            /* In bytes. */
+#define MinLogRecordPages       48              /* In log pages. */
+/*
+ * Log file restart page header (found at the start of each restart page).
+ */
+typedef struct {
+/*Ofs*/
+/*  0   NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */
+/*  0*/ NTFS_RECORD_TYPE magic; /* "RSTR" ("CHKD" if changed by chkdsk). */
+/*  4*/ le16 usa_ofs;           /* See NTFS_RECORD definition in layout.h.
+                                   When creating, set this to be immediately
+                                   after this header structure (without any
+                                   alignment). */
+/*  6*/ le16 usa_count;         /* See NTFS_RECORD definition in layout.h. */
+/*  8*/ leLSN chkdsk_lsn;       /* The last log file sequence number found by
+                                   chkdsk.  Only used when the magic is changed
+                                   to "CHKD".  Otherwise this is zero. */
+/* 16*/ le32 system_page_size;  /* Byte size of system pages when the log file
+                                   was created, has to be >= 512 and a power of
+                                   2.  Use this to calculate the required size
+                                   of the usa (usa_count) and add it to usa_ofs.
+                                   Then verify that the result is less than the
+                                   value of the restart_area_offset. */
+/* 20*/ le32 log_page_size;     /* Byte size of log file pages, has to be >=
+                                   512 and a power of 2.  The default is 4096
+                                   and is used when the system page size is
+                                   between 4096 and 8192.  Otherwise this is
+                                   set to the system page size instead. */
+/* 24*/ le16 restart_area_offset;/* Byte offset from the start of this header to
+                                   the RESTART_AREA.  Value has to be aligned
+                                   to 8-byte boundary.  When creating, set this
+                                   to be after the usa. */
+/* 26*/ sle16 minor_ver;        /* Log file minor version.  Only check if major
+                                   version is 1. */
+/* 28*/ sle16 major_ver;        /* Log file major version.  We only support
+                                   version 1.1. */
+/* sizeof() = 30 (0x1e) bytes */
+} __attribute__ ((__packed__)) RESTART_PAGE_HEADER;
+/*
+ * Constant for the log client indices meaning that there are no client records
+ * in this particular client array.  Also inside the client records themselves,
+ * this means that there are no client records preceding or following this one.
+ */
+#define LOGFILE_NO_CLIENT       const_cpu_to_le16(0xffff) /* Little endian. */
+#define LOGFILE_NO_CLIENT_CPU   0xffff                    /* CPU endian. */
+/*
+ * The restart area flags known so far (RESTART_*, 16-bit, little endian),
+ * which contain information about the log file in which they are present.
+ */
+enum {
+        RESTART_VOLUME_IS_CLEAN = const_cpu_to_le16(0x0002),
+        RESTART_SPACE_FILLER    = 0xffff, /* gcc: Force enum bit width to 16. */
+} __attribute__ ((__packed__));
+typedef le16 RESTART_AREA_FLAGS;
+/*
+ * Log file restart area record.  The offset of this record is found by adding
+ * the offset of the RESTART_PAGE_HEADER to the restart_area_offset value found
+ * in it.  See notes at restart_area_offset above.
+ */
+typedef struct {
+/*Ofs*/
+/*  0*/ leLSN current_lsn;      /* The current, i.e. last LSN inside the log
+                                   when the restart area was last written.
+                                   This happens often but what is the interval?
+                                   Is it just fixed time or is it every time a
+                                   check point is written or something else?
+                                   On create set to 0. */
+/*  8*/ le16 log_clients;       /* Number of log client records in the array of
+                                   log client records which follows this
+                                   restart area.  Must be 1.  */
+/* 10*/ le16 client_free_list;  /* The index of the first free log client record
+                                   in the array of log client records.
+                                   LOGFILE_NO_CLIENT means that there are no
+                                   free log client records in the array.
+                                   If != LOGFILE_NO_CLIENT, check that
+                                   log_clients > client_free_list.  On Win2k
+                                   and presumably earlier, on a clean volume
+                                   this is != LOGFILE_NO_CLIENT, and it should
+                                   be 0, i.e. the first (and only) client
+                                   record is free and thus the logfile is
+                                   closed and hence clean.  A dirty volume
+                                   would have left the logfile open and hence
+                                   this would be LOGFILE_NO_CLIENT.  On WinXP
+                                   and presumably later, the logfile is always
+                                   open, even on clean shutdown so this should
+                                   always be LOGFILE_NO_CLIENT. */
+/* 12*/ le16 client_in_use_list;/* The index of the first in-use log client
+                                   record in the array of log client records.
+                                   LOGFILE_NO_CLIENT means that there are no
+                                   in-use log client records in the array.  If
+                                   != LOGFILE_NO_CLIENT check that log_clients
+                                   > client_in_use_list.  On Win2k and
+                                   presumably earlier, on a clean volume this
+                                   is LOGFILE_NO_CLIENT, i.e. there are no
+                                   client records in use and thus the logfile
+                                   is closed and hence clean.  A dirty volume
+                                   would have left the logfile open and hence
+                                   this would be != LOGFILE_NO_CLIENT, and it
+                                   should be 0, i.e. the first (and only)
+                                   client record is in use.  On WinXP and
+                                   presumably later, the logfile is always
+                                   open, even on clean shutdown so this should
+                                   always be 0. */
+/* 14*/ RESTART_AREA_FLAGS flags;/* Flags modifying LFS behaviour.  On Win2k
+                                   and presumably earlier this is always 0.  On
+                                   WinXP and presumably later, if the logfile
+                                   was shutdown cleanly, the second bit,
+                                   RESTART_VOLUME_IS_CLEAN, is set.  This bit
+                                   is cleared when the volume is mounted by
+                                   WinXP and set when the volume is dismounted,
+                                   thus if the logfile is dirty, this bit is
+                                   clear.  Thus we don't need to check the
+                                   Windows version to determine if the logfile
+                                   is clean.  Instead if the logfile is closed,
+                                   we know it must be clean.  If it is open and
+                                   this bit is set, we also know it must be
+                                   clean.  If on the other hand the logfile is
+                                   open and this bit is clear, we can be almost
+                                   certain that the logfile is dirty. */
+/* 16*/ le32 seq_number_bits;   /* How many bits to use for the sequence
+                                   number.  This is calculated as 67 - the
+                                   number of bits required to store the logfile
+                                   size in bytes and this can be used in with
+                                   the specified file_size as a consistency
+                                   check. */
+/* 20*/ le16 restart_area_length;/* Length of the restart area including the
+                                   client array.  Following checks required if
+                                   version matches.  Otherwise, skip them.
+                                   restart_area_offset + restart_area_length
+                                   has to be <= system_page_size.  Also,
+                                   restart_area_length has to be >=
+                                   client_array_offset + (log_clients *
+                                   sizeof(log client record)). */
+/* 22*/ le16 client_array_offset;/* Offset from the start of this record to
+                                   the first log client record if versions are
+                                   matched.  When creating, set this to be
+                                   after this restart area structure, aligned
+                                   to 8-bytes boundary.  If the versions do not
+                                   match, this is ignored and the offset is
+                                   assumed to be (sizeof(RESTART_AREA) + 7) &
+                                   ~7, i.e. rounded up to first 8-byte
+                                   boundary.  Either way, client_array_offset
+                                   has to be aligned to an 8-byte boundary.
+                                   Also, restart_area_offset +
+                                   client_array_offset has to be <= 510.
+                                   Finally, client_array_offset + (log_clients
+                                   * sizeof(log client record)) has to be <=
+                                   system_page_size.  On Win2k and presumably
+                                   earlier, this is 0x30, i.e. immediately
+                                   following this record.  On WinXP and
+                                   presumably later, this is 0x40, i.e. there
+                                   are 16 extra bytes between this record and
+                                   the client array.  This probably means that
+                                   the RESTART_AREA record is actually bigger
+                                   in WinXP and later. */
+/* 24*/ sle64 file_size;        /* Usable byte size of the log file.  If the
+                                   restart_area_offset + the offset of the
+                                   file_size are > 510 then corruption has
+                                   occured.  This is the very first check when
+                                   starting with the restart_area as if it
+                                   fails it means that some of the above values
+                                   will be corrupted by the multi sector
+                                   transfer protection.  The file_size has to
+                                   be rounded down to be a multiple of the
+                                   log_page_size in the RESTART_PAGE_HEADER and
+                                   then it has to be at least big enough to
+                                   store the two restart pages and 48 (0x30)
+                                   log record pages. */
+/* 32*/ le32 last_lsn_data_length;/* Length of data of last LSN, not including
+                                   the log record header.  On create set to
+                                   0. */
+/* 36*/ le16 log_record_header_length;/* Byte size of the log record header.
+                                   If the version matches then check that the
+                                   value of log_record_header_length is a
+                                   multiple of 8, i.e.
+                                   (log_record_header_length + 7) & ~7 ==
+                                   log_record_header_length.  When creating set
+                                   it to sizeof(LOG_RECORD_HEADER), aligned to
+                                   8 bytes. */
+/* 38*/ le16 log_page_data_offset;/* Offset to the start of data in a log record
+                                   page.  Must be a multiple of 8.  On create
+                                   set it to immediately after the update
+                                   sequence array of the log record page. */
+/* 40*/ le32 restart_log_open_count;/* A counter that gets incremented every
+                                   time the logfile is restarted which happens
+                                   at mount time when the logfile is opened.
+                                   When creating set to a random value.  Win2k
+                                   sets it to the low 32 bits of the current
+                                   system time in NTFS format (see time.h). */
+/* 44*/ le32 reserved;          /* Reserved/alignment to 8-byte boundary. */
+/* sizeof() = 48 (0x30) bytes */
+} __attribute__ ((__packed__)) RESTART_AREA;
+/*
+ * Log client record.  The offset of this record is found by adding the offset
+ * of the RESTART_AREA to the client_array_offset value found in it.
+ *
+ * The structure is packed and describes an on-disk layout, so the order and
+ * the sizes of the fields must not be changed.  Going by the offsets noted
+ * against each field, one record is 160 (0xa0) bytes long.
+ */
+typedef struct {
+/*Ofs*/
+/*  0*/ leLSN oldest_lsn;       /* Oldest LSN needed by this client.  On create
+                                   set to 0. */
+/*  8*/ leLSN client_restart_lsn;/* LSN at which this client needs to restart
+                                   the volume, i.e. the current position within
+                                   the log file.  At present, if clean this
+                                   should = current_lsn in restart area but it
+                                   probably also = current_lsn when dirty most
+                                   of the time.  At create set to 0. */
+/* 16*/ le16 prev_client;       /* The offset to the previous log client record
+                                   in the array of log client records.
+                                   LOGFILE_NO_CLIENT means there is no previous
+                                   client record, i.e. this is the first one.
+                                   This is always LOGFILE_NO_CLIENT. */
+/* 18*/ le16 next_client;       /* The offset to the next log client record in
+                                   the array of log client records.
+                                   LOGFILE_NO_CLIENT means there are no next
+                                   client records, i.e. this is the last one.
+                                   This is always LOGFILE_NO_CLIENT. */
+/* 20*/ le16 seq_number;        /* On Win2k and presumably earlier, this is set
+                                   to zero every time the logfile is restarted
+                                   and it is incremented when the logfile is
+                                   closed at dismount time.  Thus it is 0 when
+                                   dirty and 1 when clean.  On WinXP and
+                                   presumably later, this is always 0. */
+/* 22*/ u8 reserved[6];         /* Reserved/alignment. */
+/* 28*/ le32 client_name_length;/* Length of client name in bytes.  Should
+                                   always be 8, i.e. the four 2-byte Unicode
+                                   characters of "NTFS". */
+/* 32*/ ntfschar client_name[64];/* Name of the client in Unicode.  Should
+                                   always be "NTFS" with the remaining bytes
+                                   set to 0. */
+/* sizeof() = 160 (0xa0) bytes */
+} __attribute__ ((__packed__)) LOG_CLIENT_RECORD;
+extern BOOL ntfs_check_logfile(struct inode *log_vi);  /* These three logfile
+                                   routines are only declared here.  Each is
+                                   handed the vfs inode @log_vi and returns a
+                                   BOOL; see their definitions for what TRUE
+                                   and FALSE mean for each of them. */
+extern BOOL ntfs_is_logfile_clean(struct inode *log_vi);
+extern BOOL ntfs_empty_logfile(struct inode *log_vi);
+#endif /* NTFS_RW */
+#endif /* _LINUX_NTFS_LOGFILE_H */
diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h
new file mode 100644
index 000000000000..fac5944df6d8
--- /dev/null
+++ b/fs/ntfs/malloc.h
@@ -0,0 +1,62 @@
+/*
+ * malloc.h - NTFS kernel memory handling. Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2004 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#ifndef _LINUX_NTFS_MALLOC_H
+#define _LINUX_NTFS_MALLOC_H
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+/**
+ * ntfs_malloc_nofs - allocate memory in multiples of pages
+ * @size:       number of bytes to allocate
+ *
+ * Allocate @size bytes, rounded up to a multiple of PAGE_SIZE, with GFP_NOFS
+ * and return a pointer to the allocated memory.  Requests of at most one page
+ * are served by kmalloc(), anything larger by __vmalloc().  Calling this with
+ * a @size of zero is a bug.
+ *
+ * Return NULL if there was insufficient memory to complete the request, which
+ * includes requests spanning at least as many pages as the machine has
+ * physical pages.
+ */
+static inline void *ntfs_malloc_nofs(unsigned long size)
+{
+        if (unlikely(size > PAGE_SIZE)) {
+                /* More than physical memory can never be satisfied. */
+                if (unlikely(size >> PAGE_SHIFT >= num_physpages))
+                        return NULL;
+                return __vmalloc(size, GFP_NOFS | __GFP_HIGHMEM, PAGE_KERNEL);
+        }
+        BUG_ON(!size);
+        /*
+         * A whole page is handed out even for small requests.  kmalloc() has
+         * per-CPU caches so is faster than the page allocator for now.
+         */
+        return kmalloc(PAGE_SIZE, GFP_NOFS);
+}
+/**
+ * ntfs_free - release memory by the matching kernel free routine
+ * @addr:       address of the memory to free
+ *
+ * An @addr inside the range VMALLOC_START to VMALLOC_END is released with
+ * vfree(), every other address with kfree().  These are the counterparts of
+ * the two allocators used by ntfs_malloc_nofs().
+ */
+static inline void ntfs_free(void *addr)
+{
+        const unsigned long start = (unsigned long)addr;
+        if (unlikely(start >= VMALLOC_START && start < VMALLOC_END)) {
+                vfree(addr);
+                return;
+        }
+        kfree(addr);
+}
+#endif /* _LINUX_NTFS_MALLOC_H */
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
new file mode 100644
index 000000000000..dfa85ac2f8ba
--- /dev/null
+++ b/fs/ntfs/mft.c
@@ -0,0 +1,2829 @@
+/**
+ * mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2004 Anton Altaparmakov
+ * Copyright (c) 2002 Richard Russon
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#include <linux/buffer_head.h>
+#include <linux/swap.h>
+#include "attrib.h"
+#include "aops.h"
+#include "bitmap.h"
+#include "debug.h"
+#include "dir.h"
+#include "lcnalloc.h"
+#include "malloc.h"
+#include "mft.h"
+#include "ntfs.h"
+/**
+ * map_mft_record_page - map the page in which a specific mft record resides
+ * @ni:         ntfs inode whose mft record page to map
+ *
+ * This maps the page in which the mft record of the ntfs inode @ni is situated
+ * and returns a pointer to the mft record within the mapped page.  On success
+ * the page and the offset of the record within it are remembered in @ni->page
+ * and @ni->page_ofs; on failure they are reset to NULL and 0.
+ *
+ * Return value needs to be checked with IS_ERR() and if that is true PTR_ERR()
+ * contains the negative error code returned.  This is -ENOENT if the record
+ * lies beyond the end of the mft data, -EIO if the mapped data does not pass
+ * the mft record check, or the error returned by ntfs_map_page().
+ */
+static inline MFT_RECORD *map_mft_record_page(ntfs_inode *ni)
+{
+        ntfs_volume *vol = ni->vol;
+        struct inode *mft_vi = vol->mft_ino;
+        struct page *page;
+        unsigned long index, ofs, end_index;
+        BUG_ON(ni->page);
+        /*
+         * The index into the page cache and the offset within the page cache
+         * page of the wanted mft record.  The byte position of the record is
+         * shifted in 64 bits so that it cannot wrap around a 32-bit unsigned
+         * long before it is reduced to a page index.  The offset only uses
+         * the low bits of the position, which a wrap around leaves intact.
+         * FIXME: The page index itself is still assumed to fit into an
+         * unsigned long, but I don't think we would ever get here if the
+         * volume was that big...
+         */
+        index = (u64)ni->mft_no << vol->mft_record_size_bits >>
+                        PAGE_CACHE_SHIFT;
+        ofs = (ni->mft_no << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK;
+        /* The maximum valid index into the page cache for $MFT's data. */
+        end_index = mft_vi->i_size >> PAGE_CACHE_SHIFT;
+        /* If the wanted index is out of bounds the mft record doesn't exist. */
+        if (unlikely(index >= end_index)) {
+                /*
+                 * In the last, partial page the record only exists if all of
+                 * it lies before the end of the mft data.
+                 */
+                if (index > end_index || (mft_vi->i_size & ~PAGE_CACHE_MASK) <
+                                ofs + vol->mft_record_size) {
+                        page = ERR_PTR(-ENOENT);
+                        ntfs_error(vol->sb, "Attempt to read mft record 0x%lx, "
+                                        "which is beyond the end of the mft.  "
+                                        "This is probably a bug in the ntfs "
+                                        "driver.", ni->mft_no);
+                        goto err_out;
+                }
+        }
+        /* Read, map, and pin the page. */
+        page = ntfs_map_page(mft_vi->i_mapping, index);
+        if (likely(!IS_ERR(page))) {
+                /* Catch multi sector transfer fixup errors. */
+                if (likely(ntfs_is_mft_recordp((le32*)(page_address(page) +
+                                ofs)))) {
+                        ni->page = page;
+                        ni->page_ofs = ofs;
+                        return page_address(page) + ofs;
+                }
+                ntfs_error(vol->sb, "Mft record 0x%lx is corrupt.  "
+                                "Run chkdsk.", ni->mft_no);
+                ntfs_unmap_page(page);
+                page = ERR_PTR(-EIO);
+        }
+err_out:
+        /* @page holds an ERR_PTR() encoded error on every path to here. */
+        ni->page = NULL;
+        ni->page_ofs = 0;
+        return (void*)page;
+}
+/**
+ * map_mft_record - map, pin and lock an mft record
+ * @ni:         ntfs inode whose MFT record to map
+ *
+ * First, take the mrec_lock semaphore. We might now be sleeping, while waiting
+ * for the semaphore if it was already locked by someone else.
+ *
+ * The page of the record is mapped using map_mft_record_page() before being
+ * returned to the caller.
+ *
+ * This in turn uses ntfs_map_page() to get the page containing the wanted mft
+ * record (it in turn calls read_cache_page() which reads it in from disk if
+ * necessary, increments the use count on the page so that it cannot disappear
+ * under us and returns a reference to the page cache page).
+ *
+ * If read_cache_page() invokes ntfs_readpage() to load the page from disk, it
+ * sets PG_locked and clears PG_uptodate on the page. Once I/O has completed
+ * and the post-read mst fixups on each mft record in the page have been
+ * performed, the page gets PG_uptodate set and PG_locked cleared (this is done
+ * in our asynchronous I/O completion handler end_buffer_read_mft_async()).
+ * ntfs_map_page() waits for PG_locked to become clear and checks if
+ * PG_uptodate is set and returns an error code if not. This provides
+ * sufficient protection against races when reading/using the page.
+ *
+ * However there is the write mapping to think about. Doing the above described
+ * checking here will be fine, because when initiating the write we will set
+ * PG_locked and clear PG_uptodate making sure nobody is touching the page
+ * contents. Doing the locking this way means that the commit to disk code in
+ * the page cache code paths is automatically sufficiently locked with us as
+ * we will not touch a page that has been locked or is not uptodate. The only
+ * locking problem then is them locking the page while we are accessing it.
+ *
+ * So that code will end up having to own the mrec_lock of all mft
+ * records/inodes present in the page before I/O can proceed. In that case we
+ * wouldn't need to bother with PG_locked and PG_uptodate as nobody will be
+ * accessing anything without owning the mrec_lock semaphore. But we do need
+ * to use them because of the read_cache_page() invocation and the code becomes
+ * so much simpler this way that it is well worth it.
+ *
+ * The mft record is now ours and we return a pointer to it. You need to check
+ * the returned pointer with IS_ERR() and if that is true, PTR_ERR() will return
+ * the error code.
+ *
+ * NOTE: Caller is responsible for setting the mft record dirty before calling
+ * unmap_mft_record(). This is obviously only necessary if the caller really
+ * modified the mft record...
+ * Q: Do we want to recycle one of the VFS inode state bits instead?
+ * A: No, the inode ones mean we want to change the mft record, not we want to
+ * write it out.
+ */
+MFT_RECORD *map_mft_record(ntfs_inode *ni)
+{
+        MFT_RECORD *mrec;
+        ntfs_debug("Entering for mft_no 0x%lx.", ni->mft_no);
+        /* Pin the ntfs inode so that it cannot go away under us. */
+        atomic_inc(&ni->count);
+        /* Only one user at a time may have this mft record mapped. */
+        down(&ni->mrec_lock);
+        mrec = map_mft_record_page(ni);
+        if (unlikely(IS_ERR(mrec))) {
+                /* Mapping failed, so give up the lock and the pin again. */
+                up(&ni->mrec_lock);
+                atomic_dec(&ni->count);
+                ntfs_error(ni->vol->sb, "Failed with error code %lu.",
+                                -PTR_ERR(mrec));
+        }
+        /* Either the mapped record or the ERR_PTR() encoded error. */
+        return mrec;
+}
+/**
+ * unmap_mft_record_page - unmap the page in which a specific mft record resides
+ * @ni:         ntfs inode whose mft record page to unmap
+ *
+ * This unmaps the page in which the mft record of the ntfs inode @ni is
+ * situated and returns. This is a NOOP if highmem is not configured.
+ *
+ * The unmap happens via ntfs_unmap_page() which in turn decrements the use
+ * count on the page thus releasing it from the pinned state.
+ *
+ * We do not actually unmap the page from memory of course, as that will be
+ * done by the page cache code itself when memory pressure increases or
+ * whatever.
+ */
+static inline void unmap_mft_record_page(ntfs_inode *ni)
+{
+        struct page *mapped = ni->page;
+        /* There has to be a page recorded by map_mft_record_page(). */
+        BUG_ON(!mapped);
+        /* TODO: If dirty, blah... */
+        ntfs_unmap_page(mapped);
+        /* Forget the mapping so that the record can be mapped afresh. */
+        ni->page = NULL;
+        ni->page_ofs = 0;
+}
+/**
+ * unmap_mft_record - release a mapped mft record
+ * @ni:         ntfs inode whose MFT record to unmap
+ *
+ * We release the page mapping and the mrec_lock semaphore, which unmaps the
+ * mft record and releases it for others to get hold of.  We also release the
+ * ntfs inode by decrementing the ntfs inode reference count.
+ *
+ * NOTE: If caller has modified the mft record, it is imperative to set the mft
+ * record dirty BEFORE calling unmap_mft_record().
+ */
+void unmap_mft_record(ntfs_inode *ni)
+{
+        /* Unmapping a record that is not mapped is a bug in the caller. */
+        BUG_ON(!ni->page);
+        ntfs_debug("Entering for mft_no 0x%lx.", ni->mft_no);
+        /* Undo map_mft_record() in reverse order: page, lock, reference. */
+        unmap_mft_record_page(ni);
+        up(&ni->mrec_lock);
+        atomic_dec(&ni->count);
+        /*
+         * Nothing is torn down here, not even for a pure ntfs_inode, i.e. one
+         * with no vfs inode attached.  For an extent inode that is left to
+         * ntfs_clear_extent_inode() and for a non-extent, yet pure ntfs inode
+         * it is left to the caller to do the actual tear down of all
+         * structures and freeing of all allocated memory.
+         */
+}
+/**
+ * map_extent_mft_record - load an extent inode and attach it to its base
+ * @base_ni:    base ntfs inode
+ * @mref:       mft reference of the extent inode to load
+ * @ntfs_ino:   on successful return, pointer to the ntfs_inode structure
+ *
+ * Load the extent mft record @mref and attach it to its base inode @base_ni.
+ * Return the mapped extent mft record if IS_ERR(result) is false.  Otherwise
+ * PTR_ERR(result) gives the negative error code.
+ *
+ * On successful return, @ntfs_ino contains a pointer to the ntfs_inode
+ * structure of the mapped extent inode.  On error @ntfs_ino is not written.
+ *
+ * The error codes generated here are -ENOMEM (no memory for the new extent
+ * inode or for growing the array of extent inodes of @base_ni) and -EIO (the
+ * sequence number of the mft record does not match the one in @mref).  Errors
+ * returned by map_mft_record() are passed on unchanged.
+ */
+MFT_RECORD *map_extent_mft_record(ntfs_inode *base_ni, MFT_REF mref,
+                ntfs_inode **ntfs_ino)
+{
+        MFT_RECORD *m;
+        ntfs_inode *ni = NULL;
+        ntfs_inode **extent_nis = NULL;
+        int i;
+        unsigned long mft_no = MREF(mref);
+        u16 seq_no = MSEQNO(mref);
+        BOOL destroy_ni = FALSE;
+        ntfs_debug("Mapping extent mft record 0x%lx (base mft record 0x%lx).",
+                        mft_no, base_ni->mft_no);
+        /* Make sure the base ntfs inode doesn't go away. */
+        atomic_inc(&base_ni->count);
+        /*
+         * Check if this extent inode has already been added to the base inode,
+         * in which case just return it. If not found, add it to the base
+         * inode before returning it.  Throughout this function the array of
+         * extent inodes and nr_extents are only read and changed while the
+         * extent_lock of the base inode is held.
+         */
+        down(&base_ni->extent_lock);
+        if (base_ni->nr_extents > 0) {
+                extent_nis = base_ni->ext.extent_ntfs_inos;
+                for (i = 0; i < base_ni->nr_extents; i++) {
+                        if (mft_no != extent_nis[i]->mft_no)
+                                continue;
+                        ni = extent_nis[i];
+                        /* Make sure the ntfs inode doesn't go away. */
+                        atomic_inc(&ni->count);
+                        break;
+                }
+        }
+        if (likely(ni != NULL)) {
+                up(&base_ni->extent_lock);
+                atomic_dec(&base_ni->count);
+                /* We found the record; just have to map and return it. */
+                m = map_mft_record(ni);
+                /*
+                 * Drop the reference taken in the lookup loop above.  On
+                 * success map_mft_record() has taken a reference of its own,
+                 * which stays in place while the record is mapped.
+                 */
+                atomic_dec(&ni->count);
+                if (likely(!IS_ERR(m))) {
+                        /*
+                         * Verify the sequence number.
+                         * NOTE(review): Unlike in the new inode path further
+                         * down, a zero sequence number in @mref is not
+                         * exempted from the check here.  Confirm that this
+                         * difference is intended.
+                         */
+                        if (likely(le16_to_cpu(m->sequence_number) == seq_no)) {
+                                ntfs_debug("Done 1.");
+                                *ntfs_ino = ni;
+                                return m;
+                        }
+                        unmap_mft_record(ni);
+                        ntfs_error(base_ni->vol->sb, "Found stale extent mft "
+                                        "reference! Corrupt file system. "
+                                        "Run chkdsk.");
+                        return ERR_PTR(-EIO);
+                }
+map_err_out:    /* Also the goto target of the new inode path below. */
+                ntfs_error(base_ni->vol->sb, "Failed to map extent "
+                                "mft record, error code %ld.", -PTR_ERR(m));
+                return m;
+        }
+        /*
+         * Record wasn't there. Get a new ntfs inode and initialize it.  The
+         * extent_lock of the base inode stays held until the new inode has
+         * either been attached or been discarded again.
+         */
+        ni = ntfs_new_extent_inode(base_ni->vol->sb, mft_no);
+        if (unlikely(!ni)) {
+                up(&base_ni->extent_lock);
+                atomic_dec(&base_ni->count);
+                return ERR_PTR(-ENOMEM);
+        }
+        ni->vol = base_ni->vol;
+        ni->seq_no = seq_no;
+        ni->nr_extents = -1;    /* Negative count: this is an extent inode. */
+        ni->ext.base_ntfs_ino = base_ni;
+        /* Now map the record. */
+        m = map_mft_record(ni);
+        if (IS_ERR(m)) {
+                up(&base_ni->extent_lock);
+                atomic_dec(&base_ni->count);
+                ntfs_clear_extent_inode(ni);
+                goto map_err_out;
+        }
+        /* Verify the sequence number if it is present. */
+        if (seq_no && (le16_to_cpu(m->sequence_number) != seq_no)) {
+                ntfs_error(base_ni->vol->sb, "Found stale extent mft "
+                                "reference! Corrupt file system. Run chkdsk.");
+                destroy_ni = TRUE;
+                m = ERR_PTR(-EIO);
+                goto unm_err_out;
+        }
+        /*
+         * Attach extent inode to base inode, reallocating memory if needed.
+         * The array of extent inode pointers grows four entries at a time, so
+         * a new, larger array is needed whenever nr_extents is a multiple of
+         * four (including zero, when there is no array yet).  The existing
+         * entries are copied over before the old array is freed.
+         */
+        if (!(base_ni->nr_extents & 3)) {
+                ntfs_inode **tmp;
+                int new_size = (base_ni->nr_extents + 4) * sizeof(ntfs_inode *);
+                tmp = (ntfs_inode **)kmalloc(new_size, GFP_NOFS);
+                if (unlikely(!tmp)) {
+                        ntfs_error(base_ni->vol->sb, "Failed to allocate "
+                                        "internal buffer.");
+                        destroy_ni = TRUE;
+                        m = ERR_PTR(-ENOMEM);
+                        goto unm_err_out;
+                }
+                if (base_ni->nr_extents) {
+                        BUG_ON(!base_ni->ext.extent_ntfs_inos);
+                        memcpy(tmp, base_ni->ext.extent_ntfs_inos, new_size -
+                                        4 * sizeof(ntfs_inode *));
+                        kfree(base_ni->ext.extent_ntfs_inos);
+                }
+                base_ni->ext.extent_ntfs_inos = tmp;
+        }
+        base_ni->ext.extent_ntfs_inos[base_ni->nr_extents++] = ni;
+        up(&base_ni->extent_lock);
+        atomic_dec(&base_ni->count);
+        ntfs_debug("Done 2.");
+        *ntfs_ino = ni;
+        return m;
+unm_err_out:
+        unmap_mft_record(ni);
+        up(&base_ni->extent_lock);
+        atomic_dec(&base_ni->count);
+        /*
+         * If the extent inode was not attached to the base inode we need to
+         * release it or we will leak memory.  Both paths that currently jump
+         * to this label set destroy_ni beforehand.
+         */
+        if (destroy_ni)
+                ntfs_clear_extent_inode(ni);
+        return m;
+}
+#ifdef NTFS_RW
+/**
+ * __mark_mft_record_dirty - set the mft record and the page containing it dirty
+ * @ni:         ntfs inode describing the mapped mft record
+ *
+ * Internal function.  Users should call mark_mft_record_dirty() instead.
+ *
+ * Set the mapped (extent) mft record of the (base or extent) ntfs inode @ni,
+ * as well as the page containing the mft record, dirty.  Also, mark the base
+ * vfs inode dirty.  This ensures that any changes to the mft record are
+ * written out to disk.
+ *
+ * NOTE:  We only set I_DIRTY_SYNC and I_DIRTY_DATASYNC (and not I_DIRTY_PAGES)
+ * on the base vfs inode, because even though file data may have been modified,
+ * it is dirty in the inode meta data rather than the data page cache of the
+ * inode, and thus there are no data pages that need writing out.  Therefore, a
+ * full mark_inode_dirty() is overkill.  A mark_inode_dirty_sync(), on the
+ * other hand, is not sufficient, because I_DIRTY_DATASYNC needs to be set to
+ * ensure ->write_inode is called from generic_osync_inode() and this needs to
+ * happen or the file data would not necessarily hit the device synchronously,
+ * even though the vfs inode has the O_SYNC flag set.  Also, I_DIRTY_DATASYNC
+ * simply "feels" better than just I_DIRTY_SYNC, since the file data has not
+ * actually hit the block device yet, which is not what I_DIRTY_SYNC on its own
+ * would suggest.
+ */
+void __mark_mft_record_dirty(ntfs_inode *ni)
+{
+        /* Assume a base inode until the extent count says otherwise. */
+        ntfs_inode *base_ni = ni;
+        ntfs_debug("Entering for inode 0x%lx.", ni->mft_no);
+        BUG_ON(NInoAttr(ni));
+        /* First the mapped mft record and the page it lives in. */
+        mark_ntfs_record_dirty(ni->page, ni->page_ofs);
+        /*
+         * Then the base vfs inode.  A negative extent count means that @ni is
+         * an extent inode, in which case its base inode is the one to use.
+         */
+        down(&ni->extent_lock);
+        if (unlikely(ni->nr_extents < 0))
+                base_ni = ni->ext.base_ntfs_ino;
+        up(&ni->extent_lock);
+        __mark_inode_dirty(VFS_I(base_ni), I_DIRTY_SYNC | I_DIRTY_DATASYNC);
+}
+static const char *ntfs_please_email = "Please email "
+                "linux-ntfs-dev@lists.sourceforge.net and say that you saw "
+                "this message.  Thank you."; /* Appended, via a %s argument, to
+                                   the error logged by
+                                   ntfs_sync_mft_mirror_umount(). */
+/**
+ * ntfs_sync_mft_mirror_umount - synchronize an mft record to the mft mirror
+ * @vol:        ntfs volume on which the mft record to synchronize resides
+ * @mft_no:     mft record number of mft record to synchronize
+ * @m:          mapped, mst protected (extent) mft record to synchronize
+ *
+ * Write the mapped, mst protected (extent) mft record @m with mft record
+ * number @mft_no to the mft mirror ($MFTMirr) of the ntfs volume @vol,
+ * bypassing the page cache and the $MFTMirr inode itself.
+ *
+ * This function is only for use at umount time when the mft mirror inode has
+ * already been disposed of.  We BUG() if we are called while the mft mirror
+ * inode is still attached to the volume.
+ *
+ * On success return 0.  On error return -errno.
+ *
+ * NOTE:  This function is not implemented yet as I am not convinced it can
+ * actually be triggered considering the sequence of commits we do in super.c::
+ * ntfs_put_super().  But just in case we provide this place holder as the
+ * alternative would be either to BUG() or to get a NULL pointer dereference
+ * and Oops.  As it stands, the place holder never succeeds:  It logs an error
+ * and returns -EOPNOTSUPP without looking at @mft_no or @m.
+ */
+static int ntfs_sync_mft_mirror_umount(ntfs_volume *vol,
+                const unsigned long mft_no, MFT_RECORD *m)
+{
+        BUG_ON(vol->mftmirr_ino);
+        ntfs_error(vol->sb, "Umount time mft mirror syncing is not "
+                        "implemented yet.  %s", ntfs_please_email);
+        return -EOPNOTSUPP;
+}
+/**
+ * ntfs_sync_mft_mirror - synchronize an mft record to the mft mirror
+ * @vol:        ntfs volume on which the mft record to synchronize resides
+ * @mft_no:     mft record number of mft record to synchronize
+ * @m:          mapped, mst protected (extent) mft record to synchronize
+ * @sync:       if true, wait for i/o completion
+ *
+ * Write the mapped, mst protected (extent) mft record @m with mft record
+ * number @mft_no to the mft mirror ($MFTMirr) of the ntfs volume @vol.
+ *
+ * If the mft mirror inode is no longer attached to the volume (which could
+ * happen during umount) the work is delegated to
+ * ntfs_sync_mft_mirror_umount().  No field of the mft mirror inode is
+ * accessed before its presence has been checked.
+ *
+ * The mirror page is locked and marked not uptodate for the duration of the
+ * write.  The mst protected record is copied into it, the buffers covering
+ * the record are written synchronously, and finally the mst fixups are
+ * removed from the page cache copy again before the page is set uptodate and
+ * unlocked.
+ *
+ * On success return 0.  On error return -errno and set the volume errors flag
+ * in the ntfs volume @vol.
+ *
+ * NOTE:  We always perform synchronous i/o and ignore the @sync parameter.
+ *
+ * TODO:  If @sync is false, want to do truly asynchronous i/o, i.e. just
+ * schedule i/o via ->writepage or do it via kntfsd or whatever.
+ */
+int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no,
+                MFT_RECORD *m, int sync)
+{
+        struct page *page;
+        unsigned int blocksize = vol->sb->s_blocksize;
+        int max_bhs = vol->mft_record_size / blocksize;
+        struct buffer_head *bhs[max_bhs];
+        struct buffer_head *bh, *head;
+        u8 *kmirr;
+        runlist_element *rl;
+        unsigned int block_start, block_end, m_start, m_end, page_ofs;
+        int i_bhs, nr_bhs, err = 0;
+        unsigned char blocksize_bits;
+        ntfs_debug("Entering for inode 0x%lx.", mft_no);
+        BUG_ON(!max_bhs);
+        if (unlikely(!vol->mftmirr_ino)) {
+                /* This could happen during umount... */
+                err = ntfs_sync_mft_mirror_umount(vol, mft_no, m);
+                if (likely(!err))
+                        return err;
+                goto err_out;
+        }
+        /*
+         * Only now is it safe to look inside the mft mirror inode.  Doing
+         * this in the declarations above would dereference a NULL pointer in
+         * exactly the umount case handled by the check we just passed.
+         */
+        blocksize_bits = vol->mftmirr_ino->i_blkbits;
+        /* Get the page containing the mirror copy of the mft record @m. */
+        page = ntfs_map_page(vol->mftmirr_ino->i_mapping, mft_no >>
+                        (PAGE_CACHE_SHIFT - vol->mft_record_size_bits));
+        if (IS_ERR(page)) {
+                ntfs_error(vol->sb, "Failed to map mft mirror page.");
+                err = PTR_ERR(page);
+                goto err_out;
+        }
+        lock_page(page);
+        BUG_ON(!PageUptodate(page));
+        ClearPageUptodate(page);
+        /* Offset of the mft mirror record inside the page. */
+        page_ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK;
+        /* The address in the page of the mirror copy of the mft record @m. */
+        kmirr = page_address(page) + page_ofs;
+        /* Copy the mst protected mft record to the mirror. */
+        memcpy(kmirr, m, vol->mft_record_size);
+        /* Create uptodate buffers if not present. */
+        if (unlikely(!page_has_buffers(page))) {
+                struct buffer_head *tail;
+                bh = head = alloc_page_buffers(page, blocksize, 1);
+                do {
+                        set_buffer_uptodate(bh);
+                        tail = bh;
+                        bh = bh->b_this_page;
+                } while (bh);
+                /* Close the NULL terminated list into a ring. */
+                tail->b_this_page = head;
+                attach_page_buffers(page, head);
+                BUG_ON(!page_has_buffers(page));
+        }
+        bh = head = page_buffers(page);
+        BUG_ON(!bh);
+        rl = NULL;
+        nr_bhs = 0;
+        block_start = 0;
+        m_start = kmirr - (u8*)page_address(page);
+        m_end = m_start + vol->mft_record_size;
+        /*
+         * Collect the buffers covering the record in bhs[], mapping them to
+         * disk blocks where needed.  Note that "continue" in a do-while loop
+         * jumps to the loop condition, so block_start and bh still advance.
+         */
+        do {
+                block_end = block_start + blocksize;
+                /* If the buffer is outside the mft record, skip it. */
+                if (block_end <= m_start)
+                        continue;
+                if (unlikely(block_start >= m_end))
+                        break;
+                /* Need to map the buffer if it is not mapped already. */
+                if (unlikely(!buffer_mapped(bh))) {
+                        VCN vcn;
+                        LCN lcn;
+                        unsigned int vcn_ofs;
+                        /* Obtain the vcn and offset of the current block. */
+                        vcn = ((VCN)mft_no << vol->mft_record_size_bits) +
+                                        (block_start - m_start);
+                        vcn_ofs = vcn & vol->cluster_size_mask;
+                        vcn >>= vol->cluster_size_bits;
+                        if (!rl) {
+                                /* Taken once, dropped after the loop. */
+                                down_read(&NTFS_I(vol->mftmirr_ino)->
+                                                runlist.lock);
+                                rl = NTFS_I(vol->mftmirr_ino)->runlist.rl;
+                                /*
+                                 * $MFTMirr always has the whole of its runlist
+                                 * in memory.
+                                 */
+                                BUG_ON(!rl);
+                        }
+                        /* Seek to element containing target vcn. */
+                        while (rl->length && rl[1].vcn <= vcn)
+                                rl++;
+                        lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
+                        /* For $MFTMirr, only lcn >= 0 is a successful remap. */
+                        if (likely(lcn >= 0)) {
+                                /* Setup buffer head to correct block. */
+                                bh->b_blocknr = ((lcn <<
+                                                vol->cluster_size_bits) +
+                                                vcn_ofs) >> blocksize_bits;
+                                set_buffer_mapped(bh);
+                        } else {
+                                bh->b_blocknr = -1;
+                                ntfs_error(vol->sb, "Cannot write mft mirror "
+                                                "record 0x%lx because its "
+                                                "location on disk could not "
+                                                "be determined (error code "
+                                                "%lli).", mft_no,
+                                                (long long)lcn);
+                                err = -EIO;
+                        }
+                }
+                BUG_ON(!buffer_uptodate(bh));
+                BUG_ON(!nr_bhs && (m_start != block_start));
+                BUG_ON(nr_bhs >= max_bhs);
+                bhs[nr_bhs++] = bh;
+                BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end));
+        } while (block_start = block_end, (bh = bh->b_this_page) != head);
+        if (unlikely(rl))
+                up_read(&NTFS_I(vol->mftmirr_ino)->runlist.lock);
+        if (likely(!err)) {
+                /* Lock buffers and start synchronous write i/o on them. */
+                for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
+                        struct buffer_head *tbh = bhs[i_bhs];
+                        if (unlikely(test_set_buffer_locked(tbh)))
+                                BUG();
+                        BUG_ON(!buffer_uptodate(tbh));
+                        clear_buffer_dirty(tbh);
+                        get_bh(tbh);
+                        tbh->b_end_io = end_buffer_write_sync;
+                        submit_bh(WRITE, tbh);
+                }
+                /* Wait on i/o completion of buffers. */
+                for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
+                        struct buffer_head *tbh = bhs[i_bhs];
+                        wait_on_buffer(tbh);
+                        if (unlikely(!buffer_uptodate(tbh))) {
+                                err = -EIO;
+                                /*
+                                 * Set the buffer uptodate so the page and
+                                 * buffer states do not become out of sync.
+                                 */
+                                set_buffer_uptodate(tbh);
+                        }
+                }
+        } else /* if (unlikely(err)) */ {
+                /* Clean the buffers. */
+                for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++)
+                        clear_buffer_dirty(bhs[i_bhs]);
+        }
+        /* Current state: all buffers are clean, unlocked, and uptodate. */
+        /*
+         * Remove the mst protection fixups again.  This is needed on the
+         * error path as well because the page cache copy received the mst
+         * protected record in the memcpy() above.
+         */
+        post_write_mst_fixup((NTFS_RECORD*)kmirr);
+        flush_dcache_page(page);
+        SetPageUptodate(page);
+        unlock_page(page);
+        ntfs_unmap_page(page);
+        if (likely(!err)) {
+                ntfs_debug("Done.");
+        } else {
+                ntfs_error(vol->sb, "I/O error while writing mft mirror "
+                                "record 0x%lx!", mft_no);
+                /* Also the target of the early failures above. */
+err_out:
+                ntfs_error(vol->sb, "Failed to synchronize $MFTMirr (error "
+                                "code %i).  Volume will be left marked dirty "
+                                "on umount.  Run ntfsfix on the partition "
+                                "after umounting to correct this.", -err);
+                NVolSetErrors(vol);
+        }
+        return err;
+}
+/**
+ * write_mft_record_nolock - write out a mapped (extent) mft record
+ * @ni:         ntfs inode describing the mapped (extent) mft record
+ * @m:          mapped (extent) mft record to write
+ * @sync:       if true, wait for i/o completion
+ *
+ * Write the mapped (extent) mft record @m described by the (regular or extent)
+ * ntfs inode @ni to backing store.  If the mft record @m has a counterpart in
+ * the mft mirror, that is also updated.
+ *
+ * The caller must hold the lock on the page containing the mft record and
+ * NInoAttr(@ni) must be false; we BUG() otherwise.
+ *
+ * We only write the mft record if the ntfs inode @ni is dirty and the first
+ * buffer belonging to its mft record is dirty, too.  We ignore the dirty state
+ * of subsequent buffers because we could have raced with
+ * fs/ntfs/aops.c::mark_ntfs_record_dirty().  If there is nothing to write we
+ * return 0.
+ *
+ * On success, clean the mft record and return 0.  On error, leave the mft
+ * record dirty and return -errno.  The caller should call make_bad_inode() on
+ * the base inode to ensure no more access happens to this inode.  We do not do
+ * it here as the caller may want to finish writing other extent mft records
+ * first to minimize on-disk metadata inconsistencies.
+ *
+ * NOTE(review):  The code below clears the dirty bit of @ni up front and
+ * cleans the buffers on all error paths; the record is only redirtied when
+ * the error is -ENOMEM.  The "leave the mft record dirty" statement above
+ * should be confirmed against that.
+ *
+ * An -ENOMEM error is not propagated to the caller:  the record is redirtied
+ * with mark_mft_record_dirty() so the write is retried later and 0 is
+ * returned.  All other errors set the volume errors flag in the ntfs volume.
+ *
+ * The return value of ntfs_sync_mft_mirror() is not checked here, so a
+ * failure to update the mft mirror does not change our return value.
+ *
+ * NOTE:  We always perform synchronous i/o and ignore the @sync parameter.
+ * However, if the mft record has a counterpart in the mft mirror and @sync is
+ * true, we write the mft record, wait for i/o completion, and only then write
+ * the mft mirror copy.  This ensures that if the system crashes either the mft
+ * or the mft mirror will contain a self-consistent mft record @m.  If @sync is
+ * false on the other hand, we start i/o on both and then wait for completion
+ * on them.  This provides a speedup but no longer guarantees that you will end
+ * up with a self-consistent mft record in the case of a crash but if you asked
+ * for asynchronous writing you probably do not care about that anyway.
+ *
+ * TODO:  If @sync is false, want to do truly asynchronous i/o, i.e. just
+ * schedule i/o via ->writepage or do it via kntfsd or whatever.
+ */
+int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync)
+{
+        ntfs_volume *vol = ni->vol;
+        struct page *page = ni->page;
+        unsigned char blocksize_bits = vol->mft_ino->i_blkbits;
+        unsigned int blocksize = 1 << blocksize_bits;
+        int max_bhs = vol->mft_record_size / blocksize;
+        struct buffer_head *bhs[max_bhs];
+        struct buffer_head *bh, *head;
+        runlist_element *rl;
+        unsigned int block_start, block_end, m_start, m_end;
+        int i_bhs, nr_bhs, err = 0;
+        ntfs_debug("Entering for inode 0x%lx.", ni->mft_no);
+        BUG_ON(NInoAttr(ni));
+        BUG_ON(!max_bhs);
+        BUG_ON(!PageLocked(page));
+        /*
+         * If the ntfs_inode is clean no need to do anything.  If it is dirty,
+         * mark it as clean now so that it can be redirtied later on if needed.
+         * There is no danger of races since the caller is holding the locks
+         * for the mft record @m and the page it is in.
+         */
+        if (!NInoTestClearDirty(ni))
+                goto done;
+        BUG_ON(!page_has_buffers(page));
+        bh = head = page_buffers(page);
+        BUG_ON(!bh);
+        rl = NULL;
+        nr_bhs = 0;
+        block_start = 0;
+        m_start = ni->page_ofs;
+        m_end = m_start + vol->mft_record_size;
+        do {
+                block_end = block_start + blocksize;
+                /*
+                 * If the buffer is outside the mft record, skip it.  The
+                 * continue jumps to the condition of the do-while loop so
+                 * block_start and bh are still advanced to the next buffer.
+                 */
+                if (block_end <= m_start)
+                        continue;
+                if (unlikely(block_start >= m_end))
+                        break;
+                /*
+                 * If this block is not the first one in the record, we ignore
+                 * the buffer's dirty state because we could have raced with a
+                 * parallel mark_ntfs_record_dirty().
+                 */
+                if (block_start == m_start) {
+                        /* This block is the first one in the record. */
+                        if (!buffer_dirty(bh)) {
+                                BUG_ON(nr_bhs);
+                                /*
+                                 * Clean records are not written out.  We leave
+                                 * the loop with nr_bhs == 0 which makes us
+                                 * return success below.
+                                 */
+                                break;
+                        }
+                }
+                /* Need to map the buffer if it is not mapped already. */
+                if (unlikely(!buffer_mapped(bh))) {
+                        VCN vcn;
+                        LCN lcn;
+                        unsigned int vcn_ofs;
+                        /* Obtain the vcn and offset of the current block. */
+                        vcn = ((VCN)ni->mft_no << vol->mft_record_size_bits) +
+                                        (block_start - m_start);
+                        vcn_ofs = vcn & vol->cluster_size_mask;
+                        vcn >>= vol->cluster_size_bits;
+                        if (!rl) {
+                                down_read(&NTFS_I(vol->mft_ino)->runlist.lock);
+                                rl = NTFS_I(vol->mft_ino)->runlist.rl;
+                                BUG_ON(!rl);
+                        }
+                        /* Seek to element containing target vcn. */
+                        while (rl->length && rl[1].vcn <= vcn)
+                                rl++;
+                        lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
+                        /* For $MFT, only lcn >= 0 is a successful remap. */
+                        if (likely(lcn >= 0)) {
+                                /* Setup buffer head to correct block. */
+                                bh->b_blocknr = ((lcn <<
+                                                vol->cluster_size_bits) +
+                                                vcn_ofs) >> blocksize_bits;
+                                set_buffer_mapped(bh);
+                        } else {
+                                bh->b_blocknr = -1;
+                                ntfs_error(vol->sb, "Cannot write mft record "
+                                                "0x%lx because its location "
+                                                "on disk could not be "
+                                                "determined (error code %lli).",
+                                                ni->mft_no, (long long)lcn);
+                                err = -EIO;
+                        }
+                }
+                BUG_ON(!buffer_uptodate(bh));
+                BUG_ON(!nr_bhs && (m_start != block_start));
+                BUG_ON(nr_bhs >= max_bhs);
+                bhs[nr_bhs++] = bh;
+                BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end));
+        } while (block_start = block_end, (bh = bh->b_this_page) != head);
+        if (unlikely(rl))
+                up_read(&NTFS_I(vol->mft_ino)->runlist.lock);
+        if (!nr_bhs)
+                goto done;
+        if (unlikely(err))
+                goto cleanup_out;
+        /* Apply the mst protection fixups. */
+        err = pre_write_mst_fixup((NTFS_RECORD*)m, vol->mft_record_size);
+        if (err) {
+                ntfs_error(vol->sb, "Failed to apply mst fixups!");
+                goto cleanup_out;
+        }
+        flush_dcache_mft_record_page(ni);
+        /* Lock buffers and start synchronous write i/o on them. */
+        for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
+                struct buffer_head *tbh = bhs[i_bhs];
+                if (unlikely(test_set_buffer_locked(tbh)))
+                        BUG();
+                BUG_ON(!buffer_uptodate(tbh));
+                clear_buffer_dirty(tbh);
+                get_bh(tbh);
+                tbh->b_end_io = end_buffer_write_sync;
+                submit_bh(WRITE, tbh);
+        }
+        /*
+         * Synchronize the mft mirror now if not @sync.  Only mft records with
+         * a number below vol->mftmirr_size are mirrored.  The return value is
+         * not checked.
+         */
+        if (!sync && ni->mft_no < vol->mftmirr_size)
+                ntfs_sync_mft_mirror(vol, ni->mft_no, m, sync);
+        /* Wait on i/o completion of buffers. */
+        for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
+                struct buffer_head *tbh = bhs[i_bhs];
+                wait_on_buffer(tbh);
+                if (unlikely(!buffer_uptodate(tbh))) {
+                        err = -EIO;
+                        /*
+                         * Set the buffer uptodate so the page and buffer
+                         * states do not become out of sync.
+                         */
+                        if (PageUptodate(page))
+                                set_buffer_uptodate(tbh);
+                }
+        }
+        /*
+         * If @sync, now synchronize the mft mirror.  This is done even if the
+         * write of the mft record itself failed above and, as in the !@sync
+         * case, the return value is not checked.
+         */
+        if (sync && ni->mft_no < vol->mftmirr_size)
+                ntfs_sync_mft_mirror(vol, ni->mft_no, m, sync);
+        /* Remove the mst protection fixups again. */
+        post_write_mst_fixup((NTFS_RECORD*)m);
+        flush_dcache_mft_record_page(ni);
+        if (unlikely(err)) {
+                /* I/O error during writing.  This is really bad! */
+                ntfs_error(vol->sb, "I/O error while writing mft record "
+                                "0x%lx!  Marking base inode as bad.  You "
+                                "should unmount the volume and run chkdsk.",
+                                ni->mft_no);
+                goto err_out;
+        }
+done:
+        ntfs_debug("Done.");
+        return 0;
+cleanup_out:
+        /*
+         * Clean the buffers.  We get here when no i/o was started, i.e. when
+         * mapping a buffer or applying the mst fixups failed.
+         */
+        for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++)
+                clear_buffer_dirty(bhs[i_bhs]);
+err_out:
+        /*
+         * Current state: all buffers are clean, unlocked, and uptodate.
+         * The caller should mark the base inode as bad so that no more i/o
+         * happens.  ->clear_inode() will still be invoked so all extent inodes
+         * and other allocated memory will be freed.
+         *
+         * The exception is -ENOMEM which we treat as transient: the record is
+         * redirtied and success is returned to the caller.
+         */
+        if (err == -ENOMEM) {
+                ntfs_error(vol->sb, "Not enough memory to write mft record.  "
+                                "Redirtying so the write is retried later.");
+                mark_mft_record_dirty(ni);
+                err = 0;
+        } else
+                NVolSetErrors(vol);
+        return err;
+}
+/**
+ * ntfs_may_write_mft_record - check if an mft record may be written out
+ * @vol:        [IN]  ntfs volume on which the mft record to check resides
+ * @mft_no:     [IN]  mft record number of the mft record to check
+ * @m:          [IN]  mapped mft record to check
+ * @locked_ni:  [OUT] caller has to unlock this ntfs inode if one is returned
+ *
+ * Decide whether the mapped (base or extent) mft record @m, which has mft
+ * record number @mft_no and lives on the ntfs volume @vol, may be written out.
+ * Where required and possible, the ntfs inode owning the mft record is locked
+ * and the base vfs inode is pinned.  That ntfs inode is handed back in
+ * @locked_ni and it is then up to the caller to unlock it and to unpin the
+ * base vfs inode.
+ *
+ * Return TRUE if the mft record may be written out and FALSE if not.
+ *
+ * The caller holds the page lock and has cleared the uptodate flag of the
+ * page.  Because of this a dirty mft record whose inode is not in icache (as
+ * reported by ilookup5()) can be written out safely: anyone opening/creating
+ * such an inode blocks in read_cache_page() when trying to map the mft record
+ * until we have finished the write out.
+ *
+ * The checks are performed in this order:
+ *
+ * 1) The inode with number @mft_no is in icache.  The mft record must then be
+ *    a base mft record.
+ *    - If the inode is dirty, return FALSE.  The vfs inode write paths will
+ *      update the access times, which redirties the base mft record and
+ *      causes it to be written out again anyway.  (The access time update is
+ *      known to hit the base mft record because Windows chkdsk complains if
+ *      the standard information attribute is not in the base mft record.)
+ *    - If the inode is clean but the mft record lock is already taken, it is
+ *      not safe to write and we return FALSE.
+ *    - If the inode is clean and we got the lock, we have exclusive access to
+ *      the mft record.  Set @locked_ni to the locked ntfs inode and return
+ *      TRUE.
+ *    The mft record lock is only ever tried and never slept on.  Normally the
+ *    mft record is locked before the page is, but here the page is already
+ *    locked, so sleeping on the mft record lock would deadlock due to lock
+ *    reversal.
+ *
+ * 2) The inode with number @mft_no is not in icache.
+ *    - If the mft record is not a FILE record or is a base mft record, return
+ *      TRUE.
+ *    - Otherwise it is an extent mft record.  If the inode of its base mft
+ *      record is not in icache, return TRUE.
+ *    - If the base inode is in icache but has no ntfs inode for this extent
+ *      mft record attached, return TRUE.
+ *    - If the extent ntfs inode is attached but its mft record lock is
+ *      already taken, return FALSE.
+ *    - If we got the lock of the extent mft record, set the extent ntfs inode
+ *      clean, set @locked_ni to it and return TRUE.
+ *
+ * We write dirty mft records from here rather than relying solely on the vfs
+ * inode dirty code paths because mft records can be modified without ever
+ * having an inode in memory, and because a dirty mft record can have a clean
+ * ntfs inode in memory.  Neither case would be written out by the vfs inode
+ * dirty code paths.  Both do occur when new mft records are allocated, in
+ * particular when the initialized_size of the $MFT/$DATA attribute is
+ * extended and the new space is initialized with ntfs_mft_record_format().
+ * The clean inode shows up when such an mft record is reused for a new inode
+ * before it was written out.
+ */
+BOOL ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no,
+                const MFT_RECORD *m, ntfs_inode **locked_ni)
+{
+        struct super_block *sb = vol->sb;
+        struct inode *mft_inode = vol->mft_ino;
+        struct inode *inode;
+        ntfs_inode *tni, *ext_ni, **ext_list;
+        ntfs_attr key;
+        int idx;
+        ntfs_debug("Entering for inode 0x%lx.", mft_no);
+        /* The common case is that no locked inode is handed back. */
+        BUG_ON(!locked_ni);
+        *locked_ni = NULL;
+        /*
+         * Look for the inode belonging to this mft record in the VFS inode
+         * cache.  A successful lookup gives us a reference to the inode.
+         */
+        ntfs_debug("Looking for inode 0x%lx in icache.", mft_no);
+        key.mft_no = mft_no;
+        key.name = NULL;
+        key.name_len = 0;
+        key.type = AT_UNUSED;
+        if (mft_no)
+                inode = ilookup5(sb, mft_no, (test_t)ntfs_test_inode, &key);
+        else {
+                /*
+                 * Inode 0 is $MFT itself and ilookup5() must not be used on
+                 * it from here.  The kernel has already locked that inode
+                 * (fs/fs-writeback.c::__sync_single_inode()) and ilookup5()
+                 * waits for the inode to be unlocked, which cannot happen as
+                 * long as ntfs_may_write_mft_record() does not return, i.e.
+                 * we would deadlock.  As inode 0 stays pinned in icache for
+                 * the whole duration of the mount we simply grab it directly.
+                 * This also balances the iput() done further down.
+                 */
+                inode = igrab(mft_inode);
+                BUG_ON(inode != mft_inode);
+        }
+        if (inode) {
+                /* Case 1: the inode is in icache. */
+                ntfs_debug("Base inode 0x%lx is in icache.", mft_no);
+                tni = NTFS_I(inode);
+                /* Hold on to the ntfs inode while we look at it. */
+                atomic_inc(&tni->count);
+                /* A dirty inode means the record is not written from here. */
+                if (NInoDirty(tni)) {
+                        ntfs_debug("Inode 0x%lx is dirty, do not write it.",
+                                        mft_no);
+                        goto refuse_base;
+                }
+                ntfs_debug("Inode 0x%lx is not dirty.", mft_no);
+                /* Clean inode: try, without sleeping, to lock the record. */
+                if (unlikely(down_trylock(&tni->mrec_lock))) {
+                        ntfs_debug("Mft record 0x%lx is already locked, do "
+                                        "not write it.", mft_no);
+                        goto refuse_base;
+                }
+                ntfs_debug("Managed to lock mft record 0x%lx, write it.",
+                                mft_no);
+                /*
+                 * The mft record lock must be held across the write, hence
+                 * the locked ntfs inode is passed back to the caller.
+                 */
+                *locked_ni = tni;
+                return TRUE;
+refuse_base:
+                /* Drop both references taken above and refuse the write. */
+                atomic_dec(&tni->count);
+                iput(inode);
+                return FALSE;
+        }
+        /* Case 2: the inode is not in icache. */
+        ntfs_debug("Inode 0x%lx is not in icache.", mft_no);
+        /* Records that are not of type "FILE" can always be written. */
+        if (!ntfs_is_mft_record(m->magic)) {
+                ntfs_debug("Mft record 0x%lx is not a FILE record, write it.",
+                                mft_no);
+                return TRUE;
+        }
+        /* The same goes for base mft records. */
+        if (!m->base_mft_record) {
+                ntfs_debug("Mft record 0x%lx is a base record, write it.",
+                                mft_no);
+                return TRUE;
+        }
+        /*
+         * We are dealing with an extent mft record.  Look up the inode of its
+         * base mft record in icache, which gives us a reference if found.
+         */
+        key.mft_no = MREF_LE(m->base_mft_record);
+        ntfs_debug("Mft record 0x%lx is an extent record.  Looking for base "
+                        "inode 0x%lx in icache.", mft_no, key.mft_no);
+        inode = ilookup5(sb, key.mft_no, (test_t)ntfs_test_inode, &key);
+        if (!inode) {
+                /* No base inode in icache so the extent can be written. */
+                ntfs_debug("Base inode 0x%lx is not in icache, write the "
+                                "extent record.", key.mft_no);
+                return TRUE;
+        }
+        ntfs_debug("Base inode 0x%lx is in icache.", key.mft_no);
+        /*
+         * With the base inode in icache, find out whether the extent inode
+         * for this extent mft record is attached to it.
+         */
+        tni = NTFS_I(inode);
+        down(&tni->extent_lock);
+        if (tni->nr_extents <= 0) {
+                /* Nothing is attached at all so the extent can be written. */
+                up(&tni->extent_lock);
+                iput(inode);
+                ntfs_debug("Base inode 0x%lx has no attached extent inodes, "
+                                "write the extent record.", key.mft_no);
+                return TRUE;
+        }
+        /* Search the attached extent inodes for our mft record number. */
+        ext_list = tni->ext.extent_ntfs_inos;
+        ext_ni = NULL;
+        idx = 0;
+        while (idx < tni->nr_extents) {
+                ntfs_inode *cand = ext_list[idx++];
+                if (cand->mft_no == mft_no) {
+                        /* This is the extent inode we are after. */
+                        ext_ni = cand;
+                        break;
+                }
+        }
+        /* An extent inode that is not attached can be written. */
+        if (!ext_ni) {
+                up(&tni->extent_lock);
+                iput(inode);
+                ntfs_debug("Extent inode 0x%lx is not attached to its base "
+                                "inode 0x%lx, write the extent record.",
+                                mft_no, key.mft_no);
+                return TRUE;
+        }
+        ntfs_debug("Extent inode 0x%lx is attached to its base inode 0x%lx.",
+                        mft_no, key.mft_no);
+        /* Pin the extent ntfs inode before letting go of the extent lock. */
+        atomic_inc(&ext_ni->count);
+        up(&tni->extent_lock);
+        /*
+         * We have the extent inode corresponding to this extent mft record.
+         * Try, without sleeping, to lock its mft record.
+         */
+        if (unlikely(down_trylock(&ext_ni->mrec_lock))) {
+                atomic_dec(&ext_ni->count);
+                iput(inode);
+                ntfs_debug("Extent mft record 0x%lx is already locked, do "
+                                "not write it.", mft_no);
+                return FALSE;
+        }
+        ntfs_debug("Managed to lock extent mft record 0x%lx, write it.",
+                        mft_no);
+        if (NInoTestClearDirty(ext_ni))
+                ntfs_debug("Extent inode 0x%lx is dirty, marking it clean.",
+                                mft_no);
+        /*
+         * The mft record lock must be held across the write, hence the
+         * locked extent ntfs inode is passed back to the caller.
+         */
+        *locked_ni = ext_ni;
+        return TRUE;
+}
+/*
+ * Error message text telling the user to unmount and run chkdsk.  The leading
+ * spaces suggest it is appended to other messages (users not shown here).
+ */
+static const char *es =
+                "  Leaving inconsistent metadata.  "
+                "Unmount and run chkdsk.";
+/**
+ * ntfs_mft_bitmap_find_and_alloc_free_rec_nolock - find and set a free bit
+ * @vol:        volume on which to search for a free mft record
+ * @base_ni:    open base inode if allocating an extent mft record or NULL
+ *
+ * Search the initialized part of the mft bitmap attribute on the ntfs volume
+ * @vol for a zero bit, set it and mark the bitmap page dirty.  Records 0 to 23
+ * are never returned.  The search starts at vol->mft_data_pos if @base_ni is
+ * NULL and at the mft record after the base mft record @base_ni otherwise.
+ *
+ * The range from the start position to the end of the searchable bitmap is
+ * scanned first (pass 1), then the range from record 24 up to it (pass 2).
+ *
+ * Return the allocated mft record number on success and -errno on error.  An
+ * error code of -ENOSPC means that there are no free mft records below 2^32 in
+ * the currently initialized mft bitmap.
+ *
+ * Locking: Caller must hold vol->mftbmp_lock for writing.
+ */
+static int ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(ntfs_volume *vol,
+                ntfs_inode *base_ni)
+{
+        s64 pass_end, ll, data_pos, pass_start, ofs, bit;
+        struct address_space *mftbmp_mapping;
+        u8 *buf, *byte;
+        struct page *page;
+        unsigned int page_ofs, size;
+        u8 pass, b;
+        ntfs_debug("Searching for free mft record in the currently "
+                        "initialized mft bitmap.");
+        mftbmp_mapping = vol->mftbmp_ino->i_mapping;
+        /*
+         * The pass ends at the number of allocated mft records, capped to the
+         * number of initialized bits so we do not overflow the mft bitmap.
+         */
+        pass_end = NTFS_I(vol->mft_ino)->allocated_size >>
+                        vol->mft_record_size_bits;
+        ll = NTFS_I(vol->mftbmp_ino)->initialized_size << 3;
+        if (pass_end > ll)
+                pass_end = ll;
+        pass = 1;
+        if (!base_ni)
+                data_pos = vol->mft_data_pos;
+        else
+                data_pos = base_ni->mft_no + 1;
+        if (data_pos < 24)
+                data_pos = 24;
+        if (data_pos >= pass_end) {
+                data_pos = 24;
+                pass = 2;
+                /* This happens on a freshly formatted volume. */
+                if (data_pos >= pass_end)
+                        return -ENOSPC;
+        }
+        pass_start = data_pos;
+        ntfs_debug("Starting bitmap search: pass %u, pass_start 0x%llx, "
+                        "pass_end 0x%llx, data_pos 0x%llx.", pass,
+                        (long long)pass_start, (long long)pass_end,
+                        (long long)data_pos);
+        /* Scan page by page until a free bit is found or both passes end. */
+        for (; pass <= 2;) {
+                /* Bytes left in this page, capped to pass_end, then as bits. */
+                ofs = data_pos >> 3;
+                page_ofs = ofs & ~PAGE_CACHE_MASK;
+                size = PAGE_CACHE_SIZE - page_ofs;
+                ll = ((pass_end + 7) >> 3) - ofs;
+                if (size > ll)
+                        size = ll;
+                size <<= 3;
+                /*
+                 * If we are still within the active pass, search the next page
+                 * for a zero bit.
+                 */
+                if (size) {
+                        page = ntfs_map_page(mftbmp_mapping,
+                                        ofs >> PAGE_CACHE_SHIFT);
+                        if (unlikely(IS_ERR(page))) {
+                                ntfs_error(vol->sb, "Failed to read mft "
+                                                "bitmap, aborting.");
+                                return PTR_ERR(page);
+                        }
+                        buf = (u8*)page_address(page) + page_ofs;
+                        bit = data_pos & 7;
+                        data_pos &= ~7ull;
+                        ntfs_debug("Before inner for loop: size 0x%x, "
+                                        "data_pos 0x%llx, bit 0x%llx", size,
+                                        (long long)data_pos, (long long)bit);
+                        for (; bit < size && data_pos + bit < pass_end;
+                                        bit &= ~7ull, bit += 8) {
+                                byte = buf + (bit >> 3);
+                                if (*byte == 0xff)
+                                        continue;
+                                b = ffz((unsigned long)*byte);
+                                if (b < 8 && b >= (bit & 7)) {
+                                        ll = data_pos + (bit & ~7ull) + b;
+                                        if (unlikely(ll >= (1ll << 32))) {
+                                                ntfs_unmap_page(page);
+                                                return -ENOSPC;
+                                        }
+                                        *byte |= 1 << b;
+                                        flush_dcache_page(page);
+                                        set_page_dirty(page);
+                                        ntfs_unmap_page(page);
+                                        ntfs_debug("Done.  (Found and "
+                                                        "allocated mft record "
+                                                        "0x%llx.)",
+                                                        (long long)ll);
+                                        return ll;
+                                }
+                        }
+                        ntfs_debug("After inner for loop: size 0x%x, "
+                                        "data_pos 0x%llx, bit 0x%llx", size,
+                                        (long long)data_pos, (long long)bit);
+                        data_pos += size;
+                        ntfs_unmap_page(page);
+                        /*
+                         * If the end of the pass has not been reached yet,
+                         * continue searching the mft bitmap for a zero bit.
+                         */
+                        if (data_pos < pass_end)
+                                continue;
+                }
+                /* This pass is exhausted, move on to the next one. */
+                if (++pass == 2) {
+                        /*
+                         * Starting the second pass, in which we scan the first
+                         * part of the zone which we omitted earlier.
+                         */
+                        pass_end = pass_start;
+                        data_pos = pass_start = 24;
+                        ntfs_debug("pass %i, pass_start 0x%llx, pass_end "
+                                        "0x%llx.", pass, (long long)pass_start,
+                                        (long long)pass_end);
+                        if (data_pos >= pass_end)
+                                break;
+                }
+        }
+        /* No free mft records in currently initialized mft bitmap. */
+        ntfs_debug("Done.  (No free mft records left in currently initialized "
+                        "mft bitmap.)");
+        return -ENOSPC;
+}
+/**
+ * ntfs_mft_bitmap_extend_allocation_nolock - extend mft bitmap by a cluster
+ * @vol:        volume on which to extend the mft bitmap attribute
+ *
+ * Extend the mft bitmap attribute on the ntfs volume @vol by one cluster.
+ *
+ * Note: Only changes allocated_size, i.e. does not touch initialized_size or
+ * data_size.  On failure the new cluster is normally released again.
+ *
+ * Return 0 on success and -errno on error.
+ *
+ * Locking: - Caller must hold vol->mftbmp_lock for writing.
+ *          - This function takes NTFS_I(vol->mftbmp_ino)->runlist.lock for
+ *            writing and releases it before returning.
+ *          - This function takes vol->lcnbmp_lock for writing and releases it
+ *            before returning.
+ */
+static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol)
+{
+        LCN lcn;
+        s64 ll;
+        struct page *page;
+        ntfs_inode *mft_ni, *mftbmp_ni;
+        runlist_element *rl, *rl2 = NULL;
+        ntfs_attr_search_ctx *ctx = NULL;
+        MFT_RECORD *mrec;
+        ATTR_RECORD *a = NULL;
+        int ret, mp_size;
+        u32 old_alen = 0;
+        u8 *b, tb;
+        struct {
+                u8 added_cluster:1;
+                u8 added_run:1;
+                u8 mp_rebuilt:1;
+        } status = { 0, 0, 0 };
+        ntfs_debug("Extending mft bitmap allocation.");
+        mft_ni = NTFS_I(vol->mft_ino);
+        mftbmp_ni = NTFS_I(vol->mftbmp_ino);
+        /*
+         * Determine the last lcn of the mft bitmap.  The allocated size of the
+         * mft bitmap cannot be zero so we are ok to do this.
+         * ntfs_find_vcn() returns the runlist locked on success.
+         */
+        rl = ntfs_find_vcn(mftbmp_ni, (mftbmp_ni->allocated_size - 1) >>
+                        vol->cluster_size_bits, TRUE);
+        if (unlikely(IS_ERR(rl) || !rl->length || rl->lcn < 0)) {
+                ntfs_error(vol->sb, "Failed to determine last allocated "
+                                "cluster of mft bitmap attribute.");
+                if (!IS_ERR(rl)) {
+                        up_write(&mftbmp_ni->runlist.lock);
+                        ret = -EIO;
+                } else
+                        ret = PTR_ERR(rl);
+                return ret;
+        }
+        lcn = rl->lcn + rl->length;
+        ntfs_debug("Last lcn of mft bitmap attribute is 0x%llx.",
+                        (long long)lcn);
+        /*
+         * Attempt to get the cluster following the last allocated cluster by
+         * hand as it may be in the MFT zone so the allocator would not give it
+         * to us.
+         */
+        ll = lcn >> 3;
+        page = ntfs_map_page(vol->lcnbmp_ino->i_mapping,
+                        ll >> PAGE_CACHE_SHIFT);
+        if (IS_ERR(page)) {
+                up_write(&mftbmp_ni->runlist.lock);
+                ntfs_error(vol->sb, "Failed to read from lcn bitmap.");
+                return PTR_ERR(page);
+        }
+        b = (u8*)page_address(page) + (ll & ~PAGE_CACHE_MASK);
+        tb = 1 << (lcn & 7ull);
+        down_write(&vol->lcnbmp_lock);
+        if (*b != 0xff && !(*b & tb)) {
+                /* Next cluster is free, set its bit in the lcn bitmap. */
+                *b |= tb;
+                flush_dcache_page(page);
+                set_page_dirty(page);
+                up_write(&vol->lcnbmp_lock);
+                ntfs_unmap_page(page);
+                /* Grow the last run and shift the terminator's vcn by one. */
+                rl->length++;
+                rl[1].vcn++;
+                status.added_cluster = 1;
+                ntfs_debug("Appending one cluster to mft bitmap.");
+        } else {
+                up_write(&vol->lcnbmp_lock);
+                ntfs_unmap_page(page);
+                /* Allocate a cluster from the DATA_ZONE. */
+                rl2 = ntfs_cluster_alloc(vol, rl[1].vcn, 1, lcn, DATA_ZONE);
+                if (IS_ERR(rl2)) {
+                        up_write(&mftbmp_ni->runlist.lock);
+                        ntfs_error(vol->sb, "Failed to allocate a cluster for "
+                                        "the mft bitmap.");
+                        return PTR_ERR(rl2);
+                }
+                rl = ntfs_runlists_merge(mftbmp_ni->runlist.rl, rl2);
+                if (IS_ERR(rl)) {
+                        up_write(&mftbmp_ni->runlist.lock);
+                        ntfs_error(vol->sb, "Failed to merge runlists for mft "
+                                        "bitmap.");
+                        if (ntfs_cluster_free_from_rl(vol, rl2)) {
+                                ntfs_error(vol->sb, "Failed to dealocate "
+                                                "allocated cluster.%s", es);
+                                NVolSetErrors(vol);
+                        }
+                        ntfs_free(rl2);
+                        return PTR_ERR(rl);
+                }
+                mftbmp_ni->runlist.rl = rl;
+                status.added_run = 1;
+                ntfs_debug("Adding one run to mft bitmap.");
+                /* Advance @rl to the last run before the terminator. */
+                for (; rl[1].length; rl++)
+                        ;
+        }
+        /*
+         * Update the attribute record as well.  Note: @rl is the last
+         * (non-terminator) runlist element of mft bitmap.
+         */
+        mrec = map_mft_record(mft_ni);
+        if (IS_ERR(mrec)) {
+                ntfs_error(vol->sb, "Failed to map mft record.");
+                ret = PTR_ERR(mrec);
+                goto undo_alloc;
+        }
+        ctx = ntfs_attr_get_search_ctx(mft_ni, mrec);
+        if (unlikely(!ctx)) {
+                ntfs_error(vol->sb, "Failed to get search context.");
+                ret = -ENOMEM;
+                goto undo_alloc;
+        }
+        ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
+                        mftbmp_ni->name_len, CASE_SENSITIVE, rl[1].vcn, NULL,
+                        0, ctx);
+        if (unlikely(ret)) {
+                ntfs_error(vol->sb, "Failed to find last attribute extent of "
+                                "mft bitmap attribute.");
+                if (ret == -ENOENT)
+                        ret = -EIO;
+                goto undo_alloc;
+        }
+        a = ctx->attr;
+        ll = sle64_to_cpu(a->data.non_resident.lowest_vcn);
+        /* Walk back to the run containing this extent's lowest_vcn, @ll. */
+        for (rl2 = rl; rl2 > mftbmp_ni->runlist.rl; rl2--) {
+                if (ll >= rl2->vcn)
+                        break;
+        }
+        BUG_ON(ll < rl2->vcn);
+        BUG_ON(ll >= rl2->vcn + rl2->length);
+        /* Get the size for the new mapping pairs array for this extent. */
+        mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll);
+        if (unlikely(mp_size <= 0)) {
+                ntfs_error(vol->sb, "Get size for mapping pairs failed for "
+                                "mft bitmap attribute extent.");
+                ret = mp_size;
+                if (!ret)
+                        ret = -EIO;
+                goto undo_alloc;
+        }
+        /* Expand the attribute record if necessary, remembering old length. */
+        old_alen = le32_to_cpu(a->length);
+        ret = ntfs_attr_record_resize(ctx->mrec, a, mp_size +
+                        le16_to_cpu(a->data.non_resident.mapping_pairs_offset));
+        if (unlikely(ret)) {
+                if (ret != -ENOSPC) {
+                        ntfs_error(vol->sb, "Failed to resize attribute "
+                                        "record for mft bitmap attribute.");
+                        goto undo_alloc;
+                }
+                // TODO: Deal with this by moving this extent to a new mft
+                // record or by starting a new extent in a new mft record or by
+                // moving other attributes out of this mft record.
+                ntfs_error(vol->sb, "Not enough space in this mft record to "
+                                "accomodate extended mft bitmap attribute "
+                                "extent.  Cannot handle this yet.");
+                ret = -EOPNOTSUPP;
+                goto undo_alloc;
+        }
+        status.mp_rebuilt = 1;
+        /* Generate the mapping pairs array directly into the attr record. */
+        ret = ntfs_mapping_pairs_build(vol, (u8*)a +
+                        le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
+                        mp_size, rl2, ll, NULL);
+        if (unlikely(ret)) {
+                ntfs_error(vol->sb, "Failed to build mapping pairs array for "
+                                "mft bitmap attribute.");
+                goto undo_alloc;
+        }
+        /* Update the highest_vcn. */
+        a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 1);
+        /*
+         * We now have extended the mft bitmap allocated_size by one cluster.
+         * Reflect this in the ntfs_inode structure and the attribute record.
+         */
+        if (a->data.non_resident.lowest_vcn) {
+                /*
+                 * We are not in the first attribute extent, switch to it, but
+                 * first ensure the changes will make it to disk later.
+                 */
+                flush_dcache_mft_record_page(ctx->ntfs_ino);
+                mark_mft_record_dirty(ctx->ntfs_ino);
+                ntfs_attr_reinit_search_ctx(ctx);
+                ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
+                                mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL,
+                                0, ctx);
+                if (unlikely(ret)) {
+                        ntfs_error(vol->sb, "Failed to find first attribute "
+                                        "extent of mft bitmap attribute.");
+                        goto restore_undo_alloc;
+                }
+                a = ctx->attr;
+        }
+        mftbmp_ni->allocated_size += vol->cluster_size;
+        a->data.non_resident.allocated_size =
+                        cpu_to_sle64(mftbmp_ni->allocated_size);
+        /* Ensure the changes make it to disk. */
+        flush_dcache_mft_record_page(ctx->ntfs_ino);
+        mark_mft_record_dirty(ctx->ntfs_ino);
+        ntfs_attr_put_search_ctx(ctx);
+        unmap_mft_record(mft_ni);
+        up_write(&mftbmp_ni->runlist.lock);
+        ntfs_debug("Done.");
+        return 0;
+restore_undo_alloc:
+        ntfs_attr_reinit_search_ctx(ctx);
+        if (ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
+                        mftbmp_ni->name_len, CASE_SENSITIVE, rl[1].vcn, NULL,
+                        0, ctx)) {
+                ntfs_error(vol->sb, "Failed to find last attribute extent of "
+                                "mft bitmap attribute.%s", es);
+                mftbmp_ni->allocated_size += vol->cluster_size;
+                ntfs_attr_put_search_ctx(ctx);
+                unmap_mft_record(mft_ni);
+                up_write(&mftbmp_ni->runlist.lock);
+                /*
+                 * The only thing that is now wrong is ->allocated_size of the
+                 * base attribute extent which chkdsk should be able to fix.
+                 */
+                NVolSetErrors(vol);
+                return ret;
+        }
+        a = ctx->attr;
+        a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 2);
+undo_alloc:
+        if (status.added_cluster) {
+                /* Truncate the last run in the runlist by one cluster. */
+                rl->length--;
+                rl[1].vcn--;
+        } else if (status.added_run) {
+                lcn = rl->lcn;
+                /* Turn the last run into the runlist terminator. */
+                rl->lcn = rl[1].lcn;
+                rl->length = 0;
+        }
+        /* Deallocate the cluster by clearing its bit in the lcn bitmap. */
+        down_write(&vol->lcnbmp_lock);
+        if (ntfs_bitmap_clear_bit(vol->lcnbmp_ino, lcn)) {
+                ntfs_error(vol->sb, "Failed to free allocated cluster.%s", es);
+                NVolSetErrors(vol);
+        }
+        up_write(&vol->lcnbmp_lock);
+        if (status.mp_rebuilt) {
+                if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
+                                a->data.non_resident.mapping_pairs_offset),
+                                old_alen - le16_to_cpu(
+                                a->data.non_resident.mapping_pairs_offset),
+                                rl2, ll, NULL)) {
+                        ntfs_error(vol->sb, "Failed to restore mapping pairs "
+                                        "array.%s", es);
+                        NVolSetErrors(vol);
+                }
+                if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) {
+                        ntfs_error(vol->sb, "Failed to restore attribute "
+                                        "record.%s", es);
+                        NVolSetErrors(vol);
+                }
+                flush_dcache_mft_record_page(ctx->ntfs_ino);
+                mark_mft_record_dirty(ctx->ntfs_ino);
+        }
+        if (ctx)
+                ntfs_attr_put_search_ctx(ctx);
+        if (!IS_ERR(mrec))
+                unmap_mft_record(mft_ni);
+        up_write(&mftbmp_ni->runlist.lock);
+        return ret;
+}
+/**
+ * ntfs_mft_bitmap_extend_initialized_nolock - extend mftbmp initialized data
+ * @vol:        volume on which to extend the mft bitmap attribute
+ *
+ * Extend the initialized portion of the mft bitmap attribute on the ntfs
+ * volume @vol by 8 bytes and fill those bytes with zeroes.
+ *
+ * Note:  Only changes initialized_size and, if it would become smaller than
+ * that, data_size.  allocated_size must fit the new initialized_size.
+ *
+ * Return 0 on success and -errno on error (old sizes restored if possible).
+ *
+ * Locking: Caller must hold vol->mftbmp_lock for writing.
+ */
+static int ntfs_mft_bitmap_extend_initialized_nolock(ntfs_volume *vol)
+{
+        s64 old_data_size, old_initialized_size;
+        struct inode *mftbmp_vi;
+        ntfs_inode *mft_ni, *mftbmp_ni;
+        ntfs_attr_search_ctx *ctx;
+        MFT_RECORD *mrec;
+        ATTR_RECORD *a;
+        int ret;
+        ntfs_debug("Extending mft bitmap initiailized (and data) size.");
+        mft_ni = NTFS_I(vol->mft_ino);
+        mftbmp_vi = vol->mftbmp_ino;
+        mftbmp_ni = NTFS_I(mftbmp_vi);
+        /* Map the mft inode's record and find the first bitmap extent. */
+        mrec = map_mft_record(mft_ni);
+        if (IS_ERR(mrec)) {
+                ntfs_error(vol->sb, "Failed to map mft record.");
+                return PTR_ERR(mrec);
+        }
+        ctx = ntfs_attr_get_search_ctx(mft_ni, mrec);
+        if (unlikely(!ctx)) {
+                ntfs_error(vol->sb, "Failed to get search context.");
+                ret = -ENOMEM;
+                goto unm_err_out;
+        }
+        ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
+                        mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL, 0, ctx);
+        if (unlikely(ret)) {
+                ntfs_error(vol->sb, "Failed to find first attribute extent of "
+                                "mft bitmap attribute.");
+                if (ret == -ENOENT)
+                        ret = -EIO;
+                goto put_err_out;
+        }
+        a = ctx->attr;
+        old_data_size = mftbmp_vi->i_size;
+        old_initialized_size = mftbmp_ni->initialized_size;
+        /*
+         * We can simply update the initialized_size before filling the space
+         * with zeroes because the caller is holding the mft bitmap lock for
+         * writing which ensures that no one else is trying to access the data.
+         */
+        mftbmp_ni->initialized_size += 8;
+        a->data.non_resident.initialized_size =
+                        cpu_to_sle64(mftbmp_ni->initialized_size);
+        if (mftbmp_ni->initialized_size > mftbmp_vi->i_size) {
+                mftbmp_vi->i_size = mftbmp_ni->initialized_size;
+                a->data.non_resident.data_size =
+                                cpu_to_sle64(mftbmp_vi->i_size);
+        }
+        /* Ensure the changes make it to disk. */
+        flush_dcache_mft_record_page(ctx->ntfs_ino);
+        mark_mft_record_dirty(ctx->ntfs_ino);
+        ntfs_attr_put_search_ctx(ctx);
+        unmap_mft_record(mft_ni);
+        /* Zero the eight newly initialized bytes of the mft bitmap. */
+        ret = ntfs_attr_set(mftbmp_ni, old_initialized_size, 8, 0);
+        if (likely(!ret)) {
+                ntfs_debug("Done.  (Wrote eight initialized bytes to mft "
+                                "bitmap.");
+                return 0;
+        }
+        ntfs_error(vol->sb, "Failed to write to mft bitmap.");
+        /* Try to recover: put back the old sizes in inode and record. */
+        mrec = map_mft_record(mft_ni);
+        if (IS_ERR(mrec)) {
+                ntfs_error(vol->sb, "Failed to map mft record.%s", es);
+                NVolSetErrors(vol);
+                return ret;
+        }
+        ctx = ntfs_attr_get_search_ctx(mft_ni, mrec);
+        if (unlikely(!ctx)) {
+                ntfs_error(vol->sb, "Failed to get search context.%s", es);
+                NVolSetErrors(vol);
+                goto unm_err_out;
+        }
+        if (ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
+                        mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL, 0, ctx)) {
+                ntfs_error(vol->sb, "Failed to find first attribute extent of "
+                                "mft bitmap attribute.%s", es);
+                NVolSetErrors(vol);
+put_err_out:
+                ntfs_attr_put_search_ctx(ctx);
+unm_err_out:
+                unmap_mft_record(mft_ni);
+                goto err_out;
+        }
+        a = ctx->attr;
+        mftbmp_ni->initialized_size = old_initialized_size;
+        a->data.non_resident.initialized_size =
+                        cpu_to_sle64(old_initialized_size);
+        if (mftbmp_vi->i_size != old_data_size) {
+                mftbmp_vi->i_size = old_data_size;
+                a->data.non_resident.data_size = cpu_to_sle64(old_data_size);
+        }
+        flush_dcache_mft_record_page(ctx->ntfs_ino);
+        mark_mft_record_dirty(ctx->ntfs_ino);
+        ntfs_attr_put_search_ctx(ctx);
+        unmap_mft_record(mft_ni);
+        ntfs_debug("Restored status of mftbmp: allocated_size 0x%llx, "
+                        "data_size 0x%llx, initialized_size 0x%llx.",
+                        (long long)mftbmp_ni->allocated_size,
+                        (long long)mftbmp_vi->i_size,
+                        (long long)mftbmp_ni->initialized_size);
+err_out:
+        return ret;
+}
+/**
+ * ntfs_mft_data_extend_allocation_nolock - extend mft data attribute
+ * @vol:        volume on which to extend the mft data attribute
+ *
+ * Extend the mft data attribute on the ntfs volume @vol by 16 mft records
+ * worth of clusters or if not enough space for this by one mft record worth
+ * of clusters.
+ *
+ * Note:  Only changes allocated_size, i.e. does not touch initialized_size or
+ * data_size.
+ *
+ * Return 0 on success and -errno on error.
+ *
+ * Locking: - Caller must hold vol->mftbmp_lock for writing.
+ *          - This function takes NTFS_I(vol->mft_ino)->runlist.lock for
+ *            writing and releases it before returning.
+ *          - This function calls functions which take vol->lcnbmp_lock for
+ *            writing and release it before returning.
+ */
+static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol)
+{
+        LCN lcn;
+        VCN old_last_vcn;
+        s64 min_nr, nr, ll = 0;
+        ntfs_inode *mft_ni;
+        runlist_element *rl, *rl2;
+        ntfs_attr_search_ctx *ctx = NULL;
+        MFT_RECORD *mrec;
+        ATTR_RECORD *a = NULL;
+        int ret, mp_size;
+        u32 old_alen = 0;
+        BOOL mp_rebuilt = FALSE;
+        ntfs_debug("Extending mft data allocation.");
+        mft_ni = NTFS_I(vol->mft_ino);
+        /*
+         * Determine the preferred allocation location, i.e. the last lcn of
+         * the mft data attribute.  The allocated size of the mft data
+         * attribute cannot be zero so we are ok to do this.
+         * ntfs_find_vcn() returns the runlist locked on success.
+         */
+        rl = ntfs_find_vcn(mft_ni, (mft_ni->allocated_size - 1) >>
+                        vol->cluster_size_bits, TRUE);
+        if (unlikely(IS_ERR(rl) || !rl->length || rl->lcn < 0)) {
+                ntfs_error(vol->sb, "Failed to determine last allocated "
+                                "cluster of mft data attribute.");
+                if (!IS_ERR(rl)) {
+                        up_write(&mft_ni->runlist.lock);
+                        ret = -EIO;
+                } else
+                        ret = PTR_ERR(rl);
+                return ret;
+        }
+        lcn = rl->lcn + rl->length;
+        ntfs_debug("Last lcn of mft data attribute is 0x%llx.",
+                        (long long)lcn);
+        /* Minimum allocation is one mft record worth of clusters. */
+        min_nr = vol->mft_record_size >> vol->cluster_size_bits;
+        if (!min_nr)
+                min_nr = 1;
+        /* Want to allocate 16 mft records worth of clusters. */
+        nr = vol->mft_record_size << 4 >> vol->cluster_size_bits;
+        if (!nr)
+                nr = min_nr;
+        /* Ensure we do not go above 2^32-1 mft records. */
+        if (unlikely((mft_ni->allocated_size +
+                        (nr << vol->cluster_size_bits)) >>
+                        vol->mft_record_size_bits >= (1ll << 32))) {
+                nr = min_nr;
+                if (unlikely((mft_ni->allocated_size +
+                                (nr << vol->cluster_size_bits)) >>
+                                vol->mft_record_size_bits >= (1ll << 32))) {
+                        ntfs_warning(vol->sb, "Cannot allocate mft record "
+                                        "because the maximum number of inodes "
+                                        "(2^32) has already been reached.");
+                        up_write(&mft_ni->runlist.lock);
+                        return -ENOSPC;
+                }
+        }
+        ntfs_debug("Trying mft data allocation with %s cluster count %lli.",
+                        nr > min_nr ? "default" : "minimal", (long long)nr);
+        old_last_vcn = rl[1].vcn;
+        do {
+                rl2 = ntfs_cluster_alloc(vol, old_last_vcn, nr, lcn, MFT_ZONE);
+                if (likely(!IS_ERR(rl2)))
+                        break;
+                if (PTR_ERR(rl2) != -ENOSPC || nr == min_nr) {
+                        ntfs_error(vol->sb, "Failed to allocate the minimal "
+                                        "number of clusters (%lli) for the "
+                                        "mft data attribute.", (long long)nr);
+                        up_write(&mft_ni->runlist.lock);
+                        return PTR_ERR(rl2);
+                }
+                /*
+                 * There is not enough space to do the allocation, but there
+                 * might be enough space to do a minimal allocation so try that
+                 * before failing.
+                 */
+                nr = min_nr;
+                ntfs_debug("Retrying mft data allocation with minimal cluster "
+                                "count %lli.", (long long)nr);
+        } while (1);
+        rl = ntfs_runlists_merge(mft_ni->runlist.rl, rl2);
+        if (IS_ERR(rl)) {
+                up_write(&mft_ni->runlist.lock);
+                ntfs_error(vol->sb, "Failed to merge runlists for mft data "
+                                "attribute.");
+                if (ntfs_cluster_free_from_rl(vol, rl2)) {
+                        ntfs_error(vol->sb, "Failed to dealocate clusters "
+                                        "from the mft data attribute.%s", es);
+                        NVolSetErrors(vol);
+                }
+                ntfs_free(rl2);
+                return PTR_ERR(rl);
+        }
+        mft_ni->runlist.rl = rl;
+        ntfs_debug("Allocated %lli clusters.", nr);
+        /* Find the last run in the new runlist. */
+        for (; rl[1].length; rl++)
+                ;
+        /* Update the attribute record as well. */
+        mrec = map_mft_record(mft_ni);
+        if (IS_ERR(mrec)) {
+                ntfs_error(vol->sb, "Failed to map mft record.");
+                ret = PTR_ERR(mrec);
+                goto undo_alloc;
+        }
+        ctx = ntfs_attr_get_search_ctx(mft_ni, mrec);
+        if (unlikely(!ctx)) {
+                ntfs_error(vol->sb, "Failed to get search context.");
+                ret = -ENOMEM;
+                goto undo_alloc;
+        }
+        ret = ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len,
+                        CASE_SENSITIVE, rl[1].vcn, NULL, 0, ctx);
+        if (unlikely(ret)) {
+                ntfs_error(vol->sb, "Failed to find last attribute extent of "
+                                "mft data attribute.");
+                if (ret == -ENOENT)
+                        ret = -EIO;
+                goto undo_alloc;
+        }
+        a = ctx->attr;
+        ll = sle64_to_cpu(a->data.non_resident.lowest_vcn);
+        /* Search back for the previous last allocated cluster of mft data. */
+        for (rl2 = rl; rl2 > mft_ni->runlist.rl; rl2--) {
+                if (ll >= rl2->vcn)
+                        break;
+        }
+        BUG_ON(ll < rl2->vcn);
+        BUG_ON(ll >= rl2->vcn + rl2->length);
+        /* Get the size for the new mapping pairs array for this extent. */
+        mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll);
+        if (unlikely(mp_size <= 0)) {
+                ntfs_error(vol->sb, "Get size for mapping pairs failed for "
+                                "mft data attribute extent.");
+                ret = mp_size;
+                if (!ret)
+                        ret = -EIO;
+                goto undo_alloc;
+        }
+        /* Expand the attribute record if necessary. */
+        old_alen = le32_to_cpu(a->length);
+        ret = ntfs_attr_record_resize(ctx->mrec, a, mp_size +
+                        le16_to_cpu(a->data.non_resident.mapping_pairs_offset));
+        if (unlikely(ret)) {
+                if (ret != -ENOSPC) {
+                        ntfs_error(vol->sb, "Failed to resize attribute "
+                                        "record for mft data attribute.");
+                        goto undo_alloc;
+                }
+                // TODO: Deal with this by moving this extent to a new mft
+                // record or by starting a new extent in a new mft record or by
+                // moving other attributes out of this mft record.
+                // Note: Use the special reserved mft records and ensure that
+                // this extent is not required to find the mft record in
+                // question.
+                ntfs_error(vol->sb, "Not enough space in this mft record to "
+                                "accomodate extended mft data attribute "
+                                "extent.  Cannot handle this yet.");
+                ret = -EOPNOTSUPP;
+                goto undo_alloc;
+        }
+        mp_rebuilt = TRUE;
+        /* Generate the mapping pairs array directly into the attr record. */
+        ret = ntfs_mapping_pairs_build(vol, (u8*)a +
+                        le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
+                        mp_size, rl2, ll, NULL);
+        if (unlikely(ret)) {
+                ntfs_error(vol->sb, "Failed to build mapping pairs array of "
+                                "mft data attribute.");
+                goto undo_alloc;
+        }
+        /* Update the highest_vcn. */
+        a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 1);
+        /*
+         * We now have extended the mft data allocated_size by nr clusters.
+         * Reflect this in the ntfs_inode structure and the attribute record.
+         * @rl is the last (non-terminator) runlist element of mft data
+         * attribute.
+         */
+        if (a->data.non_resident.lowest_vcn) {
+                /*
+                 * We are not in the first attribute extent, switch to it, but
+                 * first ensure the changes will make it to disk later.
+                 */
+                flush_dcache_mft_record_page(ctx->ntfs_ino);
+                mark_mft_record_dirty(ctx->ntfs_ino);
+                ntfs_attr_reinit_search_ctx(ctx);
+                ret = ntfs_attr_lookup(mft_ni->type, mft_ni->name,
+                                mft_ni->name_len, CASE_SENSITIVE, 0, NULL, 0,
+                                ctx);
+                if (unlikely(ret)) {
+                        ntfs_error(vol->sb, "Failed to find first attribute "
+                                        "extent of mft data attribute.");
+                        goto restore_undo_alloc;
+                }
+                a = ctx->attr;
+        }
+        mft_ni->allocated_size += nr << vol->cluster_size_bits;
+        a->data.non_resident.allocated_size =
+                        cpu_to_sle64(mft_ni->allocated_size);
+        /* Ensure the changes make it to disk. */
+        flush_dcache_mft_record_page(ctx->ntfs_ino);
+        mark_mft_record_dirty(ctx->ntfs_ino);
+        ntfs_attr_put_search_ctx(ctx);
+        unmap_mft_record(mft_ni);
+        up_write(&mft_ni->runlist.lock);
+        ntfs_debug("Done.");
+        return 0;
+restore_undo_alloc:
+        ntfs_attr_reinit_search_ctx(ctx);
+        if (ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len,
+                        CASE_SENSITIVE, rl[1].vcn, NULL, 0, ctx)) {
+                ntfs_error(vol->sb, "Failed to find last attribute extent of "
+                                "mft data attribute.%s", es);
+                mft_ni->allocated_size += nr << vol->cluster_size_bits;
+                ntfs_attr_put_search_ctx(ctx);
+                unmap_mft_record(mft_ni);
+                up_write(&mft_ni->runlist.lock);
+                /*
+                 * The only thing that is now wrong is ->allocated_size of the
+                 * base attribute extent which chkdsk should be able to fix.
+                 */
+                NVolSetErrors(vol);
+                return ret;
+        }
+        a = ctx->attr;
+        a->data.non_resident.highest_vcn = cpu_to_sle64(old_last_vcn - 1);
+undo_alloc:
+        if (ntfs_cluster_free(vol->mft_ino, old_last_vcn, -1) < 0) {
+                ntfs_error(vol->sb, "Failed to free clusters from mft data "
+                                "attribute.%s", es);
+                NVolSetErrors(vol);
+        }
+        if (ntfs_rl_truncate_nolock(vol, &mft_ni->runlist, old_last_vcn)) {
+                ntfs_error(vol->sb, "Failed to truncate mft data attribute "
+                                "runlist.%s", es);
+                NVolSetErrors(vol);
+        }
+        if (mp_rebuilt) {
+                if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
+                                a->data.non_resident.mapping_pairs_offset),
+                                old_alen - le16_to_cpu(
+                                a->data.non_resident.mapping_pairs_offset),
+                                rl2, ll, NULL)) {
+                        ntfs_error(vol->sb, "Failed to restore mapping pairs "
+                                        "array.%s", es);
+                        NVolSetErrors(vol);
+                }
+                if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) {
+                        ntfs_error(vol->sb, "Failed to restore attribute "
+                                        "record.%s", es);
+                        NVolSetErrors(vol);
+                }
+                flush_dcache_mft_record_page(ctx->ntfs_ino);
+                mark_mft_record_dirty(ctx->ntfs_ino);
+        }
+        if (ctx)
+                ntfs_attr_put_search_ctx(ctx);
+        if (!IS_ERR(mrec))
+                unmap_mft_record(mft_ni);
+        up_write(&mft_ni->runlist.lock);
+        return ret;
+}
+/**
+ * ntfs_mft_record_layout - write a pristine mft record into a memory buffer
+ * @vol:        volume the mft record is destined for
+ * @mft_no:     number of the mft record being laid out
+ * @m:          buffer to fill, at least @vol->mft_record_size bytes large
+ *
+ * Fill the buffer @m with an empty mft record carrying the record number
+ * @mft_no.  The resulting record has no flags set (in particular it is not
+ * marked in use) and contains nothing but the terminating AT_END attribute.
+ * @vol is consulted because volumes older than NTFS 3.1 use the shorter
+ * MFT_RECORD_OLD header, which lacks the mft_record_number field.
+ *
+ * Return 0 on success and -ERANGE if @mft_no does not fit into 32 bits.
+ */
+static int ntfs_mft_record_layout(const ntfs_volume *vol, const s64 mft_no,
+                MFT_RECORD *m)
+{
+        ATTR_RECORD *terminator;
+        unsigned int usa_pos, attrs_pos;
+        ntfs_debug("Entering for mft record 0x%llx.", (long long)mft_no);
+        if (mft_no >= (1ll << 32)) {
+                ntfs_error(vol->sb, "Mft record number 0x%llx exceeds "
+                                "maximum of 2^32.", (long long)mft_no);
+                return -ERANGE;
+        }
+        /* Wipe the complete record so we start from a clean slate. */
+        memset(m, 0, vol->mft_record_size);
+        m->magic = magic_FILE;
+        /*
+         * The update sequence array follows the record header, rounded up to
+         * a 2-byte boundary.  Which header applies depends on the volume
+         * version.
+         */
+        if (vol->major_ver > 3 || (vol->major_ver == 3 && vol->minor_ver)) {
+                m->usa_ofs = cpu_to_le16((sizeof(MFT_RECORD) + 1) & ~1);
+                /* These fields only exist in the NTFS 3.1+ header. */
+                m->reserved = 0;
+                m->mft_record_number = cpu_to_le32((u32)mft_no);
+        } else
+                m->usa_ofs = cpu_to_le16((sizeof(MFT_RECORD_OLD) + 1) & ~1);
+        usa_pos = le16_to_cpu(m->usa_ofs);
+        /* One entry per NTFS block plus one for the sequence number itself. */
+        if (vol->mft_record_size < NTFS_BLOCK_SIZE) {
+                m->usa_count = cpu_to_le16(1);
+                ntfs_warning(vol->sb, "Sector size is bigger than mft record "
+                                "size.  Setting usa_count to 1.  If chkdsk "
+                                "reports this as corruption, please email "
+                                "linux-ntfs-dev@lists.sourceforge.net stating "
+                                "that you saw this message and that the "
+                                "modified file system created was corrupt.  "
+                                "Thank you.");
+        } else
+                m->usa_count = cpu_to_le16(vol->mft_record_size /
+                                NTFS_BLOCK_SIZE + 1);
+        /* The update sequence number starts out as 1. */
+        *(le16*)((u8*)m + usa_pos) = cpu_to_le16(1);
+        m->lsn = 0;
+        m->sequence_number = cpu_to_le16(1);
+        m->link_count = 0;
+        m->flags = 0;
+        m->base_mft_record = 0;
+        m->next_attr_instance = 0;
+        /*
+         * The attributes begin right behind the update sequence array, at the
+         * next 8-byte boundary.
+         */
+        m->attrs_offset = cpu_to_le16((usa_pos +
+                        (le16_to_cpu(m->usa_count) << 1) + 7) & ~7);
+        attrs_pos = le16_to_cpu(m->attrs_offset);
+        /*
+         * Only the eight byte termination attribute is in use.  As the
+         * attributes offset is 8-byte aligned the sum is aligned, too.
+         */
+        m->bytes_in_use = cpu_to_le32(attrs_pos + 8);
+        m->bytes_allocated = cpu_to_le32(vol->mft_record_size);
+        /* Finally, write the termination attribute. */
+        terminator = (ATTR_RECORD*)((u8*)m + attrs_pos);
+        terminator->type = AT_END;
+        terminator->length = 0;
+        ntfs_debug("Done.");
+        return 0;
+}
+/**
+ * ntfs_mft_record_format - format an mft record on an ntfs volume
+ * @vol:        volume on which to format the mft record
+ * @mft_no:     mft record number to format
+ *
+ * Format the mft record @mft_no in $MFT/$DATA, i.e. lay out an empty, unused
+ * mft record into the appropriate place of the mft data attribute.  This is
+ * used when extending the mft data attribute.
+ *
+ * The record must lie completely within the current data size (i_size) of
+ * $MFT/$DATA.  The caller is thus expected to have raised i_size to at least
+ * the end of the record before calling this function.
+ *
+ * Return 0 on success and -errno on error: -ENOENT if the record does not lie
+ * within i_size, otherwise the error returned by ntfs_map_page() or by
+ * ntfs_mft_record_layout().
+ */
+static int ntfs_mft_record_format(const ntfs_volume *vol, const s64 mft_no)
+{
+        struct inode *mft_vi = vol->mft_ino;
+        struct page *page;
+        MFT_RECORD *m;
+        pgoff_t index, end_index;
+        unsigned int ofs;
+        int err;
+        ntfs_debug("Entering for mft record 0x%llx.", (long long)mft_no);
+        /*
+         * The index into the page cache and the offset within the page cache
+         * page of the wanted mft record.
+         */
+        index = mft_no << vol->mft_record_size_bits >> PAGE_CACHE_SHIFT;
+        ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK;
+        /* The maximum valid index into the page cache for $MFT's data. */
+        end_index = mft_vi->i_size >> PAGE_CACHE_SHIFT;
+        if (unlikely(index >= end_index)) {
+                /*
+                 * The record is in the last, partial page.  It only exists if
+                 * it ends at or before i_size.  Note a record ending exactly
+                 * at i_size is valid, which is the normal case when the mft
+                 * data has just been extended by this one record, hence the
+                 * comparison must be > and not >=.
+                 */
+                if (unlikely(index > end_index || ofs + vol->mft_record_size >
+                                (mft_vi->i_size & ~PAGE_CACHE_MASK))) {
+                        ntfs_error(vol->sb, "Tried to format non-existing mft "
+                                        "record 0x%llx.", (long long)mft_no);
+                        return -ENOENT;
+                }
+        }
+        /* Read, map, and pin the page containing the mft record. */
+        page = ntfs_map_page(mft_vi->i_mapping, index);
+        if (unlikely(IS_ERR(page))) {
+                ntfs_error(vol->sb, "Failed to map page containing mft record "
+                                "to format 0x%llx.", (long long)mft_no);
+                return PTR_ERR(page);
+        }
+        /*
+         * Modify the record under the page lock with the uptodate flag
+         * cleared and only set the flag again once we are done with the
+         * record contents.
+         */
+        lock_page(page);
+        BUG_ON(!PageUptodate(page));
+        ClearPageUptodate(page);
+        m = (MFT_RECORD*)((u8*)page_address(page) + ofs);
+        err = ntfs_mft_record_layout(vol, mft_no, m);
+        if (unlikely(err)) {
+                ntfs_error(vol->sb, "Failed to layout mft record 0x%llx.",
+                                (long long)mft_no);
+                /* Restore the page state before dropping our references. */
+                SetPageUptodate(page);
+                unlock_page(page);
+                ntfs_unmap_page(page);
+                return err;
+        }
+        flush_dcache_page(page);
+        SetPageUptodate(page);
+        unlock_page(page);
+        /*
+         * Make sure the mft record is written out to disk.  We could use
+         * ilookup5() to check if an inode is in icache and so on but this is
+         * unnecessary as ntfs_writepage() will write the dirty record anyway.
+         */
+        mark_ntfs_record_dirty(page, ofs);
+        ntfs_unmap_page(page);
+        ntfs_debug("Done.");
+        return 0;
+}
+/**
+ * ntfs_mft_record_alloc - allocate an mft record on an ntfs volume
+ * @vol:        [IN]  volume on which to allocate the mft record
+ * @mode:       [IN]  mode if want a file or directory, i.e. base inode or 0
+ * @base_ni:    [IN]  open base inode if allocating an extent mft record or NULL
+ * @mrec:       [OUT] on successful return this is the mapped mft record
+ *
+ * Allocate an mft record in $MFT/$DATA of an open ntfs volume @vol.
+ *
+ * If @base_ni is NULL make the mft record a base mft record, i.e. a file or
+ * directory inode, and allocate it at the default allocator position.  In
+ * this case @mode is the file mode as given to us by the caller.  We in
+ * particular use @mode to distinguish whether a file or a directory is being
+ * created (S_ISREG(mode) and S_ISDIR(mode), respectively).
+ *
+ * If @base_ni is not NULL make the allocated mft record an extent record,
+ * allocate it starting at the mft record after the base mft record and attach
+ * the allocated and opened ntfs inode to the base inode @base_ni.  In this
+ * case @mode must be 0 as it is meaningless for extent inodes.
+ *
+ * You need to check the return value with IS_ERR().  If false, the function
+ * was successful and the return value is the now opened ntfs inode of the
+ * allocated mft record.  *@mrec is then set to the allocated, mapped, pinned,
+ * and locked mft record.  If IS_ERR() is true, the function failed and the
+ * error code is obtained from PTR_ERR(return value).  *@mrec is undefined in
+ * this case.
+ *
+ * Allocation strategy:
+ *
+ * To find a free mft record, we scan the mft bitmap for a zero bit.  To
+ * optimize this we start scanning at the place specified by @base_ni or if
+ * @base_ni is NULL we start where we last stopped and we perform wrap around
+ * when we reach the end.  Note, we do not try to allocate mft records below
+ * number 24 because numbers 0 to 15 are the defined system files anyway and 16
+ * to 23 are special in that they are used for storing extension mft records
+ * for the $DATA attribute of $MFT.  This is required to avoid the possibility
+ * of creating a runlist with a circular dependency which once written to disk
+ * can never be read in again.  Windows will only use records 16 to 23 for
+ * normal files if the volume is completely out of space.  We never use them
+ * which means that when the volume is really out of space we cannot create any
+ * more files while Windows can still create up to 8 small files.  We can start
+ * doing this at some later time, it does not matter much for now.
+ *
+ * When scanning the mft bitmap, we only search up to the last allocated mft
+ * record.  If there are no free records left in the range 24 to number of
+ * allocated mft records, then we extend the $MFT/$DATA attribute in order to
+ * create free mft records.  We extend the allocated size of $MFT/$DATA by 16
+ * records at a time or one cluster, if cluster size is above 16kiB.  If there
+ * is not sufficient space to do this, we try to extend by a single mft record
+ * or one cluster, if cluster size is above the mft record size.
+ *
+ * No matter how many mft records we allocate, we initialize only the first
+ * allocated mft record, incrementing mft data size and initialized size
+ * accordingly, open an ntfs_inode for it and return it to the caller, unless
+ * there are less than 24 mft records, in which case we allocate and initialize
+ * mft records until we reach record 24 which we consider as the first free mft
+ * record for use by normal files.
+ *
+ * If during any stage we overflow the initialized data in the mft bitmap, we
+ * extend the initialized size (and data size) by 8 bytes, allocating another
+ * cluster if required.  The bitmap data size has to be at least equal to the
+ * number of mft records in the mft, but it can be bigger, in which case the
+ * superfluous bits are padded with zeroes.
+ *
+ * Thus, when we return successfully (IS_ERR() is false), we will have:
+ *      - initialized / extended the mft bitmap if necessary,
+ *      - initialized / extended the mft data if necessary,
+ *      - set the bit corresponding to the mft record being allocated in the
+ *        mft bitmap,
+ *      - opened an ntfs_inode for the allocated mft record, and we will have
+ *      - returned the ntfs_inode as well as the allocated mapped, pinned, and
+ *        locked mft record.
+ *
+ * On error, the volume will be left in a consistent state and no record will
+ * be allocated.  If rolling back a partial operation fails, we may leave some
+ * inconsistent metadata in which case we set NVolErrors() so the volume is
+ * left dirty when unmounted.
+ *
+ * Note, this function cannot make use of most of the normal functions, like
+ * for example for attribute resizing, etc, because when the run list overflows
+ * the base mft record and an attribute list is used, it is very important that
+ * the extension mft records used to store the $DATA attribute of $MFT can be
+ * reached without having to read the information contained inside them, as
+ * this would make it impossible to find them in the first place after the
+ * volume is unmounted.  $MFT/$BITMAP probably does not need to follow this
+ * rule because the bitmap is not essential for finding the mft records, but on
+ * the other hand, handling the bitmap in this special way would make life
+ * easier because otherwise there might be circular invocations of functions
+ * when reading the bitmap.
+ */
+ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode,
+                ntfs_inode *base_ni, MFT_RECORD **mrec)
+{
+        s64 ll, bit, old_data_initialized, old_data_size;
+        struct inode *vi;
+        struct page *page;
+        ntfs_inode *mft_ni, *mftbmp_ni, *ni;
+        ntfs_attr_search_ctx *ctx;
+        MFT_RECORD *m;
+        ATTR_RECORD *a;
+        pgoff_t index;
+        unsigned int ofs;
+        int err;
+        le16 seq_no, usn;
+        BOOL record_formatted = FALSE;
+        if (base_ni) {
+                ntfs_debug("Entering (allocating an extent mft record for "
+                                "base mft record 0x%llx).",
+                                (long long)base_ni->mft_no);
+                /* @mode and @base_ni are mutually exclusive. */
+                BUG_ON(mode);
+        } else
+                ntfs_debug("Entering (allocating a base mft record).");
+        if (mode) {
+                /* @mode and @base_ni are mutually exclusive. */
+                BUG_ON(base_ni);
+                /* We only support creation of normal files and directories. */
+                if (!S_ISREG(mode) && !S_ISDIR(mode))
+                        return ERR_PTR(-EOPNOTSUPP);
+        }
+        BUG_ON(!mrec);
+        mft_ni = NTFS_I(vol->mft_ino);
+        mftbmp_ni = NTFS_I(vol->mftbmp_ino);
+        down_write(&vol->mftbmp_lock);
+        bit = ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(vol, base_ni);
+        if (bit >= 0) {
+                ntfs_debug("Found and allocated free record (#1), bit 0x%llx.",
+                                (long long)bit);
+                goto have_alloc_rec;
+        }
+        if (bit != -ENOSPC) {
+                up_write(&vol->mftbmp_lock);
+                return ERR_PTR(bit);
+        }
+        /*
+         * No free mft records left.  If the mft bitmap already covers more
+         * than the currently used mft records, the next records are all free,
+         * so we can simply allocate the first unused mft record.
+         * Note: We also have to make sure that the mft bitmap at least covers
+         * the first 24 mft records as they are special and whilst they may not
+         * be in use, we do not allocate from them.
+         */
+        ll = mft_ni->initialized_size >> vol->mft_record_size_bits;
+        if (mftbmp_ni->initialized_size << 3 > ll &&
+                        mftbmp_ni->initialized_size > 3) {
+                bit = ll;
+                if (bit < 24)
+                        bit = 24;
+                if (unlikely(bit >= (1ll << 32)))
+                        goto max_err_out;
+                ntfs_debug("Found free record (#2), bit 0x%llx.",
+                                (long long)bit);
+                goto found_free_rec;
+        }
+        /*
+         * The mft bitmap needs to be expanded until it covers the first unused
+         * mft record that we can allocate.
+         * Note: The smallest mft record we allocate is mft record 24.
+         */
+        bit = mftbmp_ni->initialized_size << 3;
+        if (unlikely(bit >= (1ll << 32)))
+                goto max_err_out;
+        ntfs_debug("Status of mftbmp before extension: allocated_size 0x%llx, "
+                        "data_size 0x%llx, initialized_size 0x%llx.",
+                        (long long)mftbmp_ni->allocated_size,
+                        (long long)vol->mftbmp_ino->i_size,
+                        (long long)mftbmp_ni->initialized_size);
+        if (mftbmp_ni->initialized_size + 8 > mftbmp_ni->allocated_size) {
+                /* Need to extend bitmap by one more cluster. */
+                ntfs_debug("mftbmp: initialized_size + 8 > allocated_size.");
+                err = ntfs_mft_bitmap_extend_allocation_nolock(vol);
+                if (unlikely(err)) {
+                        up_write(&vol->mftbmp_lock);
+                        goto err_out;
+                }
+                ntfs_debug("Status of mftbmp after allocation extension: "
+                                "allocated_size 0x%llx, data_size 0x%llx, "
+                                "initialized_size 0x%llx.",
+                                (long long)mftbmp_ni->allocated_size,
+                                (long long)vol->mftbmp_ino->i_size,
+                                (long long)mftbmp_ni->initialized_size);
+        }
+        /*
+         * We now have sufficient allocated space, extend the initialized_size
+         * as well as the data_size if necessary and fill the new space with
+         * zeroes.
+         */
+        err = ntfs_mft_bitmap_extend_initialized_nolock(vol);
+        if (unlikely(err)) {
+                up_write(&vol->mftbmp_lock);
+                goto err_out;
+        }
+        ntfs_debug("Status of mftbmp after initialized extention: "
+                        "allocated_size 0x%llx, data_size 0x%llx, "
+                        "initialized_size 0x%llx.",
+                        (long long)mftbmp_ni->allocated_size,
+                        (long long)vol->mftbmp_ino->i_size,
+                        (long long)mftbmp_ni->initialized_size);
+        ntfs_debug("Found free record (#3), bit 0x%llx.", (long long)bit);
+found_free_rec:
+        /* @bit is the found free mft record, allocate it in the mft bitmap. */
+        ntfs_debug("At found_free_rec.");
+        err = ntfs_bitmap_set_bit(vol->mftbmp_ino, bit);
+        if (unlikely(err)) {
+                ntfs_error(vol->sb, "Failed to allocate bit in mft bitmap.");
+                up_write(&vol->mftbmp_lock);
+                goto err_out;
+        }
+        ntfs_debug("Set bit 0x%llx in mft bitmap.", (long long)bit);
+have_alloc_rec:
+        /*
+         * The mft bitmap is now uptodate.  Deal with mft data attribute now.
+         * Note, we keep hold of the mft bitmap lock for writing until all
+         * modifications to the mft data attribute are complete, too, as they
+         * will impact decisions for mft bitmap and mft record allocation done
+         * by a parallel allocation and if the lock is not maintained a
+         * parallel allocation could allocate the same mft record as this one.
+         */
+        ll = (bit + 1) << vol->mft_record_size_bits;
+        if (ll <= mft_ni->initialized_size) {
+                ntfs_debug("Allocated mft record already initialized.");
+                goto mft_rec_already_initialized;
+        }
+        ntfs_debug("Initializing allocated mft record.");
+        /*
+         * The mft record is outside the initialized data.  Extend the mft data
+         * attribute until it covers the allocated record.  The loop is only
+         * actually traversed more than once when a freshly formatted volume is
+         * first written to so it optimizes away nicely in the common case.
+         */
+        ntfs_debug("Status of mft data before extension: "
+                        "allocated_size 0x%llx, data_size 0x%llx, "
+                        "initialized_size 0x%llx.",
+                        (long long)mft_ni->allocated_size,
+                        (long long)vol->mft_ino->i_size,
+                        (long long)mft_ni->initialized_size);
+        while (ll > mft_ni->allocated_size) {
+                err = ntfs_mft_data_extend_allocation_nolock(vol);
+                if (unlikely(err)) {
+                        ntfs_error(vol->sb, "Failed to extend mft data "
+                                        "allocation.");
+                        goto undo_mftbmp_alloc_nolock;
+                }
+                ntfs_debug("Status of mft data after allocation extension: "
+                                "allocated_size 0x%llx, data_size 0x%llx, "
+                                "initialized_size 0x%llx.",
+                                (long long)mft_ni->allocated_size,
+                                (long long)vol->mft_ino->i_size,
+                                (long long)mft_ni->initialized_size);
+        }
+        /*
+         * Extend mft data initialized size (and data size of course) to reach
+         * the allocated mft record, formatting the mft records along the way.
+         * Note: We only modify the ntfs_inode structure as that is all that is
+         * needed by ntfs_mft_record_format().  We will update the attribute
+         * record itself in one fell swoop later on.
+         */
+        old_data_initialized = mft_ni->initialized_size;
+        old_data_size = vol->mft_ino->i_size;
+        while (ll > mft_ni->initialized_size) {
+                s64 new_initialized_size, mft_no;
+                
+                new_initialized_size = mft_ni->initialized_size +
+                                vol->mft_record_size;
+                mft_no = mft_ni->initialized_size >> vol->mft_record_size_bits;
+                if (new_initialized_size > vol->mft_ino->i_size)
+                        vol->mft_ino->i_size = new_initialized_size;
+                ntfs_debug("Initializing mft record 0x%llx.",
+                                (long long)mft_no);
+                err = ntfs_mft_record_format(vol, mft_no);
+                if (unlikely(err)) {
+                        ntfs_error(vol->sb, "Failed to format mft record.");
+                        goto undo_data_init;
+                }
+                mft_ni->initialized_size = new_initialized_size;
+        }
+        record_formatted = TRUE;
+        /* Update the mft data attribute record to reflect the new sizes. */
+        m = map_mft_record(mft_ni);
+        if (IS_ERR(m)) {
+                ntfs_error(vol->sb, "Failed to map mft record.");
+                err = PTR_ERR(m);
+                goto undo_data_init;
+        }
+        ctx = ntfs_attr_get_search_ctx(mft_ni, m);
+        if (unlikely(!ctx)) {
+                ntfs_error(vol->sb, "Failed to get search context.");
+                err = -ENOMEM;
+                unmap_mft_record(mft_ni);
+                goto undo_data_init;
+        }
+        err = ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len,
+                        CASE_SENSITIVE, 0, NULL, 0, ctx);
+        if (unlikely(err)) {
+                ntfs_error(vol->sb, "Failed to find first attribute extent of "
+                                "mft data attribute.");
+                ntfs_attr_put_search_ctx(ctx);
+                unmap_mft_record(mft_ni);
+                goto undo_data_init;
+        }
+        a = ctx->attr;
+        a->data.non_resident.initialized_size =
+                        cpu_to_sle64(mft_ni->initialized_size);
+        a->data.non_resident.data_size = cpu_to_sle64(vol->mft_ino->i_size);
+        /* Ensure the changes make it to disk. */
+        flush_dcache_mft_record_page(ctx->ntfs_ino);
+        mark_mft_record_dirty(ctx->ntfs_ino);
+        ntfs_attr_put_search_ctx(ctx);
+        unmap_mft_record(mft_ni);
+        ntfs_debug("Status of mft data after mft record initialization: "
+                        "allocated_size 0x%llx, data_size 0x%llx, "
+                        "initialized_size 0x%llx.",
+                        (long long)mft_ni->allocated_size,
+                        (long long)vol->mft_ino->i_size,
+                        (long long)mft_ni->initialized_size);
+        BUG_ON(vol->mft_ino->i_size > mft_ni->allocated_size);
+        BUG_ON(mft_ni->initialized_size > vol->mft_ino->i_size);
+mft_rec_already_initialized:
+        /*
+         * We can finally drop the mft bitmap lock as the mft data attribute
+         * has been fully updated.  The only disparity left is that the
+         * allocated mft record still needs to be marked as in use to match the
+         * set bit in the mft bitmap but this is actually not a problem since
+         * this mft record is not referenced from anywhere yet and the fact
+         * that it is allocated in the mft bitmap means that no-one will try to
+         * allocate it either.
+         */
+        up_write(&vol->mftbmp_lock);
+        /*
+         * We now have allocated and initialized the mft record.  Calculate the
+         * index of and the offset within the page cache page the record is in.
+         */
+        index = bit << vol->mft_record_size_bits >> PAGE_CACHE_SHIFT;
+        ofs = (bit << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK;
+        /* Read, map, and pin the page containing the mft record. */
+        page = ntfs_map_page(vol->mft_ino->i_mapping, index);
+        if (unlikely(IS_ERR(page))) {
+                ntfs_error(vol->sb, "Failed to map page containing allocated "
+                                "mft record 0x%llx.", (long long)bit);
+                err = PTR_ERR(page);
+                goto undo_mftbmp_alloc;
+        }
+        lock_page(page);
+        BUG_ON(!PageUptodate(page));
+        ClearPageUptodate(page);
+        m = (MFT_RECORD*)((u8*)page_address(page) + ofs);
+        /* If we just formatted the mft record no need to do it again. */
+        if (!record_formatted) {
+                /* Sanity check that the mft record is really not in use. */
+                if (ntfs_is_file_record(m->magic) &&
+                                (m->flags & MFT_RECORD_IN_USE)) {
+                        ntfs_error(vol->sb, "Mft record 0x%llx was marked "
+                                        "free in mft bitmap but is marked "
+                                        "used itself.  Corrupt filesystem.  "
+                                        "Unmount and run chkdsk.",
+                                        (long long)bit);
+                        err = -EIO;
+                        SetPageUptodate(page);
+                        unlock_page(page);
+                        ntfs_unmap_page(page);
+                        NVolSetErrors(vol);
+                        goto undo_mftbmp_alloc;
+                }
+                /*
+                 * We need to (re-)format the mft record, preserving the
+                 * sequence number if it is not zero as well as the update
+                 * sequence number if it is not zero or -1 (0xffff).  This
+                 * means we do not need to care whether or not something went
+                 * wrong with the previous mft record.
+                 */
+                seq_no = m->sequence_number;
+                usn = *(le16*)((u8*)m + le16_to_cpu(m->usa_ofs));
+                err = ntfs_mft_record_layout(vol, bit, m);
+                if (unlikely(err)) {
+                        ntfs_error(vol->sb, "Failed to layout allocated mft "
+                                        "record 0x%llx.", (long long)bit);
+                        SetPageUptodate(page);
+                        unlock_page(page);
+                        ntfs_unmap_page(page);
+                        goto undo_mftbmp_alloc;
+                }
+                if (seq_no)
+                        m->sequence_number = seq_no;
+                if (usn && le16_to_cpu(usn) != 0xffff)
+                        *(le16*)((u8*)m + le16_to_cpu(m->usa_ofs)) = usn;
+        }
+        /* Set the mft record itself in use. */
+        m->flags |= MFT_RECORD_IN_USE;
+        if (S_ISDIR(mode))
+                m->flags |= MFT_RECORD_IS_DIRECTORY;
+        flush_dcache_page(page);
+        SetPageUptodate(page);
+        if (base_ni) {
+                /*
+                 * Setup the base mft record in the extent mft record.  This
+                 * completes initialization of the allocated extent mft record
+                 * and we can simply use it with map_extent_mft_record().
+                 */
+                m->base_mft_record = MK_LE_MREF(base_ni->mft_no,
+                                base_ni->seq_no);
+                /*
+                 * Allocate an extent inode structure for the new mft record,
+                 * attach it to the base inode @base_ni and map, pin, and lock
+                 * its, i.e. the allocated, mft record.
+                 */
+                m = map_extent_mft_record(base_ni, bit, &ni);
+                if (IS_ERR(m)) {
+                        ntfs_error(vol->sb, "Failed to map allocated extent "
+                                        "mft record 0x%llx.", (long long)bit);
+                        err = PTR_ERR(m);
+                        /* Set the mft record itself not in use. */
+                        m->flags &= cpu_to_le16(
+                                        ~le16_to_cpu(MFT_RECORD_IN_USE));
+                        flush_dcache_page(page);
+                        /* Make sure the mft record is written out to disk. */
+                        mark_ntfs_record_dirty(page, ofs);
+                        unlock_page(page);
+                        ntfs_unmap_page(page);
+                        goto undo_mftbmp_alloc;
+                }
+                /*
+                 * Make sure the allocated mft record is written out to disk.
+                 * No need to set the inode dirty because the caller is going
+                 * to do that anyway after finishing with the new extent mft
+                 * record (e.g. at a minimum a new attribute will be added to
+                 * the mft record).
+                 */
+                mark_ntfs_record_dirty(page, ofs);
+                unlock_page(page);
+                /*
+                 * Need to unmap the page since map_extent_mft_record() mapped
+                 * it as well so we have it mapped twice at the moment.
+                 */
+                ntfs_unmap_page(page);
+        } else {
+                /*
+                 * Allocate a new VFS inode and set it up.  NOTE: @vi->i_nlink
+                 * is set to 1 but the mft record->link_count is 0.  The caller
+                 * needs to bear this in mind.
+                 */
+                vi = new_inode(vol->sb);
+                if (unlikely(!vi)) {
+                        err = -ENOMEM;
+                        /* Set the mft record itself not in use. */
+                        m->flags &= cpu_to_le16(
+                                        ~le16_to_cpu(MFT_RECORD_IN_USE));
+                        flush_dcache_page(page);
+                        /* Make sure the mft record is written out to disk. */
+                        mark_ntfs_record_dirty(page, ofs);
+                        unlock_page(page);
+                        ntfs_unmap_page(page);
+                        goto undo_mftbmp_alloc;
+                }
+                vi->i_ino = bit;
+                /*
+                 * This is the optimal IO size (for stat), not the fs block
+                 * size.
+                 */
+                vi->i_blksize = PAGE_CACHE_SIZE;
+                /*
+                 * This is for checking whether an inode has changed w.r.t. a
+                 * file so that the file can be updated if necessary (compare
+                 * with f_version).
+                 */
+                vi->i_version = 1;
+                /* The owner and group come from the ntfs volume. */
+                vi->i_uid = vol->uid;
+                vi->i_gid = vol->gid;
+                /* Initialize the ntfs specific part of @vi. */
+                ntfs_init_big_inode(vi);
+                ni = NTFS_I(vi);
+                /*
+                 * Set the appropriate mode, attribute type, and name.  For
+                 * directories, also setup the index values to the defaults.
+                 */
+                if (S_ISDIR(mode)) {
+                        vi->i_mode = S_IFDIR | S_IRWXUGO;
+                        vi->i_mode &= ~vol->dmask;
+                        NInoSetMstProtected(ni);
+                        ni->type = AT_INDEX_ALLOCATION;
+                        ni->name = I30;
+                        ni->name_len = 4;
+                        ni->itype.index.block_size = 4096;
+                        ni->itype.index.block_size_bits = generic_ffs(4096) - 1;
+                        ni->itype.index.collation_rule = COLLATION_FILE_NAME;
+                        if (vol->cluster_size <= ni->itype.index.block_size) {
+                                ni->itype.index.vcn_size = vol->cluster_size;
+                                ni->itype.index.vcn_size_bits =
+                                                vol->cluster_size_bits;
+                        } else {
+                                ni->itype.index.vcn_size = vol->sector_size;
+                                ni->itype.index.vcn_size_bits =
+                                                vol->sector_size_bits;
+                        }
+                } else {
+                        vi->i_mode = S_IFREG | S_IRWXUGO;
+                        vi->i_mode &= ~vol->fmask;
+                        ni->type = AT_DATA;
+                        ni->name = NULL;
+                        ni->name_len = 0;
+                }
+                if (IS_RDONLY(vi))
+                        vi->i_mode &= ~S_IWUGO;
+                /* Set the inode times to the current time. */
+                vi->i_atime = vi->i_mtime = vi->i_ctime =
+                        current_fs_time(vi->i_sb);
+                /*
+                 * Set the file size to 0, the ntfs inode sizes are set to 0 by
+                 * the call to ntfs_init_big_inode() above.
+                 */
+                vi->i_size = 0;
+                vi->i_blocks = 0;
+                /* Set the sequence number. */
+                vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number);
+                /*
+                 * Manually map, pin, and lock the mft record as we already
+                 * have its page mapped and it is very easy to do.
+                 */
+                atomic_inc(&ni->count);
+                down(&ni->mrec_lock);
+                ni->page = page;
+                ni->page_ofs = ofs;
+                /*
+                 * Make sure the allocated mft record is written out to disk.
+                 * NOTE: We do not set the ntfs inode dirty because this would
+                 * fail in ntfs_write_inode() because the inode does not have a
+                 * standard information attribute yet.  Also, there is no need
+                 * to set the inode dirty because the caller is going to do
+                 * that anyway after finishing with the new mft record (e.g. at
+                 * a minimum some new attributes will be added to the mft
+                 * record).
+                 */
+                mark_ntfs_record_dirty(page, ofs);
+                unlock_page(page);
+                /* Add the inode to the inode hash for the superblock. */
+                insert_inode_hash(vi);
+                /* Update the default mft allocation position. */
+                vol->mft_data_pos = bit + 1;
+        }
+        /*
+         * Return the opened, allocated inode of the allocated mft record as
+         * well as the mapped, pinned, and locked mft record.
+         */
+        ntfs_debug("Returning opened, allocated %sinode 0x%llx.",
+                        base_ni ? "extent " : "", (long long)bit);
+        *mrec = m;
+        return ni;
+undo_data_init:
+        mft_ni->initialized_size = old_data_initialized;
+        vol->mft_ino->i_size = old_data_size;
+        goto undo_mftbmp_alloc_nolock;
+undo_mftbmp_alloc:
+        down_write(&vol->mftbmp_lock);
+undo_mftbmp_alloc_nolock:
+        if (ntfs_bitmap_clear_bit(vol->mftbmp_ino, bit)) {
+                ntfs_error(vol->sb, "Failed to clear bit in mft bitmap.%s", es);
+                NVolSetErrors(vol);
+        }
+        up_write(&vol->mftbmp_lock);
+err_out:
+        return ERR_PTR(err);
+max_err_out:
+        ntfs_warning(vol->sb, "Cannot allocate mft record because the maximum "
+                        "number of inodes (2^32) has already been reached.");
+        up_write(&vol->mftbmp_lock);
+        return ERR_PTR(-ENOSPC);
+}
+/**
+ * ntfs_extent_mft_record_free - free an extent mft record on an ntfs volume
+ * @ni:         ntfs inode of the mapped extent mft record to free
+ * @m:          mapped extent mft record of the ntfs inode @ni
+ *
+ * Free the mapped extent mft record @m of the extent ntfs inode @ni.
+ *
+ * The steps performed are, in order:
+ *      1. detach @ni from the extent inode array of its base inode (under the
+ *         base inode's extent_lock);
+ *      2. clear MFT_RECORD_IN_USE in @m and advance its sequence number;
+ *      3. write the mft record out via write_mft_record();
+ *      4. unmap the record, destroy @ni, and clear the record's bit in the
+ *         mft bitmap.
+ *
+ * Note that this function unmaps the mft record and closes and destroys @ni
+ * internally and hence you cannot use either @ni nor @m any more after this
+ * function returns success.
+ *
+ * On success return 0 and on error return -errno.  @ni and @m are still valid
+ * in this case and have not been freed.  -EBUSY is returned if the reference
+ * count of @ni is greater than 2; if writing the mft record fails, the changes
+ * are rolled back and the error from write_mft_record() is returned.
+ *
+ * For some errors an error message is displayed and the success code 0 is
+ * returned and the volume is then left dirty on umount.  This makes sense in
+ * case we could not rollback the changes that were already done since the
+ * caller no longer wants to reference this mft record so it does not matter to
+ * the caller if something is wrong with it as long as it is properly detached
+ * from the base inode.
+ */
+int ntfs_extent_mft_record_free(ntfs_inode *ni, MFT_RECORD *m)
+{
+        unsigned long mft_no = ni->mft_no;
+        ntfs_volume *vol = ni->vol;
+        ntfs_inode *base_ni;
+        ntfs_inode **extent_nis;
+        int i, err;
+        le16 old_seq_no;
+        u16 seq_no;
+        BUG_ON(NInoAttr(ni));
+        BUG_ON(ni->nr_extents != -1);
+        down(&ni->extent_lock);
+        base_ni = ni->ext.base_ntfs_ino;
+        up(&ni->extent_lock);
+        BUG_ON(base_ni->nr_extents <= 0);
+        ntfs_debug("Entering for extent inode 0x%lx, base inode 0x%lx.\n",
+                        mft_no, base_ni->mft_no);
+        down(&base_ni->extent_lock);
+        /* Make sure we are holding the only reference to the extent inode. */
+        if (atomic_read(&ni->count) > 2) {
+                /* Report the extent inode itself, not its base inode. */
+                ntfs_error(vol->sb, "Tried to free busy extent inode 0x%lx, "
+                                "not freeing.", mft_no);
+                up(&base_ni->extent_lock);
+                return -EBUSY;
+        }
+        /* Dissociate the ntfs inode from the base inode. */
+        extent_nis = base_ni->ext.extent_ntfs_inos;
+        err = -ENOENT;
+        for (i = 0; i < base_ni->nr_extents; i++) {
+                if (ni != extent_nis[i])
+                        continue;
+                /* Close the gap left by @ni by shifting the tail down. */
+                extent_nis += i;
+                base_ni->nr_extents--;
+                memmove(extent_nis, extent_nis + 1, (base_ni->nr_extents - i) *
+                                sizeof(ntfs_inode*));
+                err = 0;
+                break;
+        }
+        up(&base_ni->extent_lock);
+        if (unlikely(err)) {
+                ntfs_error(vol->sb, "Extent inode 0x%lx is not attached to "
+                                "its base inode 0x%lx.", mft_no,
+                                base_ni->mft_no);
+                BUG();
+        }
+        /*
+         * The extent inode is no longer attached to the base inode so no one
+         * can get a reference to it any more.
+         */
+        /* Mark the mft record as not in use. */
+        m->flags &= const_cpu_to_le16(~const_le16_to_cpu(MFT_RECORD_IN_USE));
+        /* Increment the sequence number, skipping zero, if it is not zero. */
+        old_seq_no = m->sequence_number;
+        seq_no = le16_to_cpu(old_seq_no);
+        if (seq_no == 0xffff)
+                seq_no = 1;
+        else if (seq_no)
+                seq_no++;
+        m->sequence_number = cpu_to_le16(seq_no);
+        /*
+         * Set the ntfs inode dirty and write it out.  We do not need to worry
+         * about the base inode here since whatever caused the extent mft
+         * record to be freed is guaranteed to do it already.
+         */
+        NInoSetDirty(ni);
+        err = write_mft_record(ni, m, 0);
+        if (unlikely(err)) {
+                ntfs_error(vol->sb, "Failed to write mft record 0x%lx, not "
+                                "freeing.", mft_no);
+                goto rollback;
+        }
+rollback_error:
+        /*
+         * Reached on the normal path and also when the rollback itself failed,
+         * in which case we carry on freeing and report success.
+         */
+        /* Unmap and throw away the now freed extent inode. */
+        unmap_extent_mft_record(ni);
+        ntfs_clear_extent_inode(ni);
+        /* Clear the bit in the $MFT/$BITMAP corresponding to this record. */
+        down_write(&vol->mftbmp_lock);
+        err = ntfs_bitmap_clear_bit(vol->mftbmp_ino, mft_no);
+        up_write(&vol->mftbmp_lock);
+        if (unlikely(err)) {
+                /*
+                 * The extent inode is gone but we failed to deallocate it in
+                 * the mft bitmap.  Just emit a warning and leave the volume
+                 * dirty on umount.
+                 */
+                ntfs_error(vol->sb, "Failed to clear bit in mft bitmap.%s", es);
+                NVolSetErrors(vol);
+        }
+        return 0;
+rollback:
+        /* Rollback what we did... */
+        down(&base_ni->extent_lock);
+        extent_nis = base_ni->ext.extent_ntfs_inos;
+        /* A multiple of four entries triggers allocation of a larger array. */
+        if (!(base_ni->nr_extents & 3)) {
+                int new_size = (base_ni->nr_extents + 4) * sizeof(ntfs_inode*);
+                extent_nis = (ntfs_inode**)kmalloc(new_size, GFP_NOFS);
+                if (unlikely(!extent_nis)) {
+                        ntfs_error(vol->sb, "Failed to allocate internal "
+                                        "buffer during rollback.%s", es);
+                        up(&base_ni->extent_lock);
+                        NVolSetErrors(vol);
+                        goto rollback_error;
+                }
+                if (base_ni->nr_extents) {
+                        BUG_ON(!base_ni->ext.extent_ntfs_inos);
+                        memcpy(extent_nis, base_ni->ext.extent_ntfs_inos,
+                                        new_size - 4 * sizeof(ntfs_inode*));
+                        kfree(base_ni->ext.extent_ntfs_inos);
+                }
+                base_ni->ext.extent_ntfs_inos = extent_nis;
+        }
+        /* Restore the in-use flag and the original sequence number. */
+        m->flags |= MFT_RECORD_IN_USE;
+        m->sequence_number = old_seq_no;
+        /* Re-attach @ni at the end of the base inode's extent array. */
+        extent_nis[base_ni->nr_extents++] = ni;
+        up(&base_ni->extent_lock);
+        mark_mft_record_dirty(ni);
+        return err;
+}
+#endif /* NTFS_RW */
diff --git a/fs/ntfs/mft.h b/fs/ntfs/mft.h
new file mode 100644
index 000000000000..407de2cef1d6
--- /dev/null
+++ b/fs/ntfs/mft.h
@@ -0,0 +1,127 @@
+/*
+ * mft.h - Defines for mft record handling in NTFS Linux kernel driver.
+ *         Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2004 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#ifndef _LINUX_NTFS_MFT_H
+#define _LINUX_NTFS_MFT_H
+#include <linux/fs.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include "inode.h"
+/*
+ * Mft record mapping interface; the functions are defined outside this header.
+ * NOTE(review): map_extent_mft_record() appears to return either the mapped
+ * record or an ERR_PTR() value and to hand the extent ntfs inode back through
+ * @ntfs_ino - confirm against the definition.
+ */
+extern MFT_RECORD *map_mft_record(ntfs_inode *ni);
+extern void unmap_mft_record(ntfs_inode *ni);
+extern MFT_RECORD *map_extent_mft_record(ntfs_inode *base_ni, MFT_REF mref,
+                ntfs_inode **ntfs_ino);
+/**
+ * unmap_extent_mft_record - unmap the mft record of an ntfs inode
+ * @ni:         ntfs inode handed straight through to unmap_mft_record()
+ *
+ * Thin wrapper that does nothing except call unmap_mft_record() on @ni.
+ */
+static inline void unmap_extent_mft_record(ntfs_inode *ni)
+{
+        unmap_mft_record(ni);
+}
+#ifdef NTFS_RW
+/**
+ * flush_dcache_mft_record_page - flush_dcache_page() for mft records
+ * @ni:         ntfs inode structure of mft record
+ *
+ * Call flush_dcache_page() for the page in which an mft record resides, i.e.
+ * the page recorded in @ni->page.
+ *
+ * This must be called every time an mft record is modified, just after the
+ * modification.
+ */
+static inline void flush_dcache_mft_record_page(ntfs_inode *ni)
+{
+        flush_dcache_page(ni->page);
+}
+/*
+ * Out-of-line part of mark_mft_record_dirty(); only called from there when
+ * NInoTestSetDirty() returned zero for @ni.
+ */
+extern void __mark_mft_record_dirty(ntfs_inode *ni);
+/**
+ * mark_mft_record_dirty - set the mft record and the page containing it dirty
+ * @ni:         ntfs inode describing the mapped mft record
+ *
+ * Mark the mapped (extent) mft record belonging to the (base or extent) ntfs
+ * inode @ni dirty, together with the page holding the record and the base vfs
+ * inode, so that modifications to the record get written out to disk.
+ *
+ * NOTE:  If NInoTestSetDirty() reports a non-zero result for @ni, nothing
+ * further is done.
+ */
+static inline void mark_mft_record_dirty(ntfs_inode *ni)
+{
+        /* Non-zero result: skip the out-of-line work. */
+        if (NInoTestSetDirty(ni))
+                return;
+        __mark_mft_record_dirty(ni);
+}
+/*
+ * Mft record write-out primitives, defined outside this header.
+ * write_mft_record_nolock() is the worker that write_mft_record() below calls
+ * with the record's page locked.
+ * NOTE(review): ntfs_sync_mft_mirror() presumably copies record @mft_no to the
+ * mft mirror - confirm against the definition.
+ */
+extern int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no,
+                MFT_RECORD *m, int sync);
+extern int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync);
+/**
+ * write_mft_record - write out a mapped (extent) mft record
+ * @ni:         ntfs inode describing the mapped (extent) mft record
+ * @m:          mapped (extent) mft record to write
+ * @sync:       if true, wait for i/o completion
+ *
+ * Wrapper around write_mft_record_nolock() (see mft.c) that holds the lock on
+ * the page @ni->page while the write is in progress.  Holding the page lock
+ * prevents races between writing the mft record through the dirty inode code
+ * paths and through the page cache write back code paths, as well as between
+ * writes of neighbouring mft records that share the same page.
+ *
+ * The page lock additionally serializes us against ->readpage() when the page
+ * is not uptodate.
+ *
+ * It is a BUG() to call this with @ni->page not set.
+ *
+ * On success, clean the mft record and return 0.  On error, leave the mft
+ * record dirty and return -errno.  The caller should call make_bad_inode() on
+ * the base inode to ensure no more access happens to this inode.  We do not do
+ * it here as the caller may want to finish writing other extent mft records
+ * first to minimize on-disk metadata inconsistencies.
+ */
+static inline int write_mft_record(ntfs_inode *ni, MFT_RECORD *m, int sync)
+{
+        struct page *mrec_page;
+        int ret;
+        mrec_page = ni->page;
+        BUG_ON(!mrec_page);
+        /* Hold the page lock across the actual write. */
+        lock_page(mrec_page);
+        ret = write_mft_record_nolock(ni, m, sync);
+        unlock_page(mrec_page);
+        return ret;
+}
+/*
+ * Remaining read-write mft interface, defined outside this header.
+ * ntfs_mft_record_alloc() returns the ntfs inode of the allocated record and
+ * stores the mapped record in *@mrec, or returns an ERR_PTR() value on error.
+ * ntfs_extent_mft_record_free() returns 0 on success and -errno on error.
+ * NOTE(review): the semantics of ntfs_may_write_mft_record() and its
+ * @locked_ni output are not visible from this header - see the definition.
+ */
+extern BOOL ntfs_may_write_mft_record(ntfs_volume *vol,
+                const unsigned long mft_no, const MFT_RECORD *m,
+                ntfs_inode **locked_ni);
+extern ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode,
+                ntfs_inode *base_ni, MFT_RECORD **mrec);
+extern int ntfs_extent_mft_record_free(ntfs_inode *ni, MFT_RECORD *m);
+#endif /* NTFS_RW */
+#endif /* _LINUX_NTFS_MFT_H */
diff --git a/fs/ntfs/mst.c b/fs/ntfs/mst.c
new file mode 100644
index 000000000000..5a858d839b65
--- /dev/null
+++ b/fs/ntfs/mst.c
@@ -0,0 +1,203 @@
+/*
+ * mst.c - NTFS multi sector transfer protection handling code. Part of the
+ *         Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2004 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#include "ntfs.h"
+/**
+ * post_read_mst_fixup - deprotect multi sector transfer protected data
+ * @b:          pointer to the data to deprotect
+ * @size:       size in bytes of @b
+ *
+ * Perform the necessary post read multi sector transfer fixup and detect the
+ * presence of incomplete multi sector transfers. - In that case, overwrite the
+ * magic of the ntfs record header being processed with "BAAD" (in memory only!)
+ * and abort processing.
+ *
+ * Return 0 on success and -EINVAL on error ("BAAD" magic will be present).
+ *
+ * NOTE: We consider the absence / invalidity of an update sequence array to
+ * mean that the structure is not protected at all and hence doesn't need to
+ * be fixed up. Thus, we return success and not failure in this case. This is
+ * in contrast to pre_write_mst_fixup(), see below.
+ *
+ * The update sequence array consists of the update sequence number followed
+ * by one saved u16 per NTFS_BLOCK_SIZE sector, and all of it must lie inside
+ * @b.  An array which would extend beyond @size is treated as invalid so that
+ * a corrupt usa_ofs cannot make us read outside of the buffer.
+ */
+int post_read_mst_fixup(NTFS_RECORD *b, const u32 size)
+{
+        u16 usa_ofs, usa_count, usn;
+        u16 *usa_pos, *data_pos;
+        /* Setup the variables. */
+        usa_ofs = le16_to_cpu(b->usa_ofs);
+        /* Decrement usa_count to get number of fixups. */
+        usa_count = le16_to_cpu(b->usa_count) - 1;
+        /*
+         * Size and alignment checks.  The array holds usa_count fixups plus
+         * the leading update sequence number, hence usa_count + 1 entries.
+         */
+        if ( size & (NTFS_BLOCK_SIZE - 1)               ||
+             usa_ofs & 1                                ||
+             usa_ofs + ((usa_count + 1) * 2) > size     ||
+             (size >> NTFS_BLOCK_SIZE_BITS) != usa_count)
+                return 0;
+        /* Position of usn in update sequence array. */
+        usa_pos = (u16*)b + usa_ofs/sizeof(u16);
+        /*
+         * The update sequence number which has to be equal to each of the
+         * u16 values before they are fixed up. Note no need to care for
+         * endianness since we are comparing and moving data for on disk
+         * structures which means the data is consistent. - If it is
+         * consistently the wrong endianness it doesn't make any difference.
+         */
+        usn = *usa_pos;
+        /*
+         * Position in protected data of first u16 that needs fixing up.
+         */
+        data_pos = (u16*)b + NTFS_BLOCK_SIZE/sizeof(u16) - 1;
+        /*
+         * Check for incomplete multi sector transfer(s).
+         */
+        while (usa_count--) {
+                if (*data_pos != usn) {
+                        /*
+                         * Incomplete multi sector transfer detected! )-:
+                         * Set the magic to "BAAD" and return failure.
+                         * Note that magic_BAAD is already converted to le32.
+                         */
+                        b->magic = magic_BAAD;
+                        return -EINVAL;
+                }
+                data_pos += NTFS_BLOCK_SIZE/sizeof(u16);
+        }
+        /* Re-setup the variables. */
+        usa_count = le16_to_cpu(b->usa_count) - 1;
+        data_pos = (u16*)b + NTFS_BLOCK_SIZE/sizeof(u16) - 1;
+        /* Fixup all sectors. */
+        while (usa_count--) {
+                /*
+                 * Increment position in usa and restore original data from
+                 * the usa into the data buffer.
+                 */
+                *data_pos = *(++usa_pos);
+                /* Increment position in data as well. */
+                data_pos += NTFS_BLOCK_SIZE/sizeof(u16);
+        }
+        return 0;
+}
+/**
+ * pre_write_mst_fixup - apply multi sector transfer protection
+ * @b:          pointer to the data to protect
+ * @size:       size in bytes of @b
+ *
+ * Perform the necessary pre write multi sector transfer fixup on the data
+ * pointed to by @b of @size.
+ *
+ * Return 0 if fixup applied (success) or -EINVAL if no fixup was performed
+ * (assumed not needed). This is in contrast to post_read_mst_fixup() above.
+ *
+ * NOTE: We consider the absence / invalidity of an update sequence array to
+ * mean that the structure is not subject to protection and hence doesn't need
+ * to be fixed up. This means that you have to create a valid update sequence
+ * array header in the ntfs record before calling this function, otherwise it
+ * will fail (the header needs to contain the position of the update sequence
+ * array together with the number of elements in the array). You also need to
+ * initialise the update sequence number before calling this function
+ * otherwise a random word will be used (whatever was in the record at that
+ * position at that time).
+ *
+ * The update sequence array consists of the update sequence number followed
+ * by one saved le16 per NTFS_BLOCK_SIZE sector, and all of it must lie inside
+ * @b.  An array which would extend beyond @size is rejected with -EINVAL so
+ * that a bogus usa_ofs cannot make us write outside of the buffer.
+ */
+int pre_write_mst_fixup(NTFS_RECORD *b, const u32 size)
+{
+        le16 *usa_pos, *data_pos;
+        u16 usa_ofs, usa_count, usn;
+        le16 le_usn;
+        /* Sanity check + only fixup if it makes sense. */
+        if (!b || ntfs_is_baad_record(b->magic) ||
+                        ntfs_is_hole_record(b->magic))
+                return -EINVAL;
+        /* Setup the variables. */
+        usa_ofs = le16_to_cpu(b->usa_ofs);
+        /* Decrement usa_count to get number of fixups. */
+        usa_count = le16_to_cpu(b->usa_count) - 1;
+        /*
+         * Size and alignment checks.  The array holds usa_count fixups plus
+         * the leading update sequence number, hence usa_count + 1 entries.
+         */
+        if ( size & (NTFS_BLOCK_SIZE - 1)               ||
+             usa_ofs & 1                                ||
+             usa_ofs + ((usa_count + 1) * 2) > size     ||
+             (size >> NTFS_BLOCK_SIZE_BITS) != usa_count)
+                return -EINVAL;
+        /* Position of usn in update sequence array. */
+        usa_pos = (le16*)((u8*)b + usa_ofs);
+        /*
+         * Cyclically increment the update sequence number
+         * (skipping 0 and -1, i.e. 0xffff).
+         */
+        usn = le16_to_cpup(usa_pos) + 1;
+        if (usn == 0xffff || !usn)
+                usn = 1;
+        le_usn = cpu_to_le16(usn);
+        *usa_pos = le_usn;
+        /* Position in data of first u16 that needs fixing up. */
+        data_pos = (le16*)b + NTFS_BLOCK_SIZE/sizeof(le16) - 1;
+        /* Fixup all sectors. */
+        while (usa_count--) {
+                /*
+                 * Increment the position in the usa and save the
+                 * original data from the data buffer into the usa.
+                 */
+                *(++usa_pos) = *data_pos;
+                /* Apply fixup to data. */
+                *data_pos = le_usn;
+                /* Increment position in data as well. */
+                data_pos += NTFS_BLOCK_SIZE/sizeof(le16);
+        }
+        return 0;
+}
+/**
+ * post_write_mst_fixup - fast deprotect multi sector transfer protected data
+ * @b:          pointer to the data to deprotect
+ *
+ * Undo the multi sector transfer protection after a write by copying each
+ * saved value from the update sequence array back over the last le16 of the
+ * corresponding NTFS_BLOCK_SIZE sector of @b.
+ *
+ * No validation whatsoever is done here: the caller is assumed to have just
+ * run pre_write_mst_fixup() on @b, so the header fields are trusted as is.
+ */
+void post_write_mst_fixup(NTFS_RECORD *b)
+{
+        const u16 usa_ofs = le16_to_cpu(b->usa_ofs);
+        /* The first array entry is the usn, the rest are the fixups. */
+        const u16 nr_fixups = le16_to_cpu(b->usa_count) - 1;
+        /* Start of the update sequence array, i.e. position of the usn. */
+        le16 *usa = (le16*)b + usa_ofs/sizeof(le16);
+        /* Last le16 of the first sector, i.e. the first protected value. */
+        le16 *sector_tail = (le16*)b + NTFS_BLOCK_SIZE/sizeof(le16) - 1;
+        u16 i;
+        /* Restore the original data of every sector from the array. */
+        for (i = 0; i < nr_fixups; i++) {
+                *sector_tail = usa[i + 1];
+                /* Step to the last le16 of the next sector. */
+                sector_tail += NTFS_BLOCK_SIZE/sizeof(le16);
+        }
+}
diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c
new file mode 100644
index 000000000000..7c7e13b43b2e
--- /dev/null
+++ b/fs/ntfs/namei.c
@@ -0,0 +1,498 @@
+/*
+ * namei.c - NTFS kernel directory inode operations. Part of the Linux-NTFS
+ *           project.
+ *
+ * Copyright (c) 2001-2004 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#include <linux/dcache.h>
+#include <linux/security.h>
+#include "attrib.h"
+#include "debug.h"
+#include "dir.h"
+#include "mft.h"
+#include "ntfs.h"
+/**
+ * ntfs_lookup - find the inode represented by a dentry in a directory inode
+ * @dir_ino:    directory inode in which to look for the inode
+ * @dent:       dentry representing the inode to look for
+ * @nd:         lookup nameidata (not referenced by this function)
+ *
+ * In short, ntfs_lookup() looks for the inode represented by the dentry @dent
+ * in the directory inode @dir_ino and if found attaches the inode to the
+ * dentry @dent.
+ *
+ * In more detail, the dentry @dent specifies which inode to look for by
+ * supplying the name of the inode in @dent->d_name.name. ntfs_lookup()
+ * converts the name to Unicode and walks the contents of the directory inode
+ * @dir_ino looking for the converted Unicode name. If the name is found in the
+ * directory, the corresponding inode is loaded by calling ntfs_iget() on its
+ * inode number and the inode is associated with the dentry @dent via a call to
+ * d_splice_alias().
+ *
+ * If the name is not found in the directory, a NULL inode is inserted into the
+ * dentry @dent via a call to d_add(). The dentry is then termed a negative
+ * dentry.
+ *
+ * Only if an actual error occurs, do we return an error via ERR_PTR().
+ *
+ * In order to handle the case insensitivity issues of NTFS with regards to the
+ * dcache and the dcache requiring only one dentry per directory, we deal with
+ * dentry aliases that only differ in case in ->ntfs_lookup() while maintaining
+ * a case sensitive dcache. This means that we get the full benefit of dcache
+ * speed when the file/directory is looked up with the same case as returned by
+ * ->ntfs_readdir() but that a lookup for any other case (or for the short file
+ * name) will not find anything in dcache and will enter ->ntfs_lookup()
+ * instead, where we search the directory for a fully matching file name
+ * (including case) and if that is not found, we search for a file name that
+ * matches with different case and if that has non-POSIX semantics we return
+ * that. We actually do only one search (case sensitive) and keep tabs on
+ * whether we have found a case insensitive match in the process.
+ *
+ * To simplify matters for us, we do not treat the short vs long filenames as
+ * two hard links but instead if the lookup matches a short filename, we
+ * return the dentry for the corresponding long filename instead.
+ *
+ * There are three cases we need to distinguish here:
+ *
+ * 1) @dent perfectly matches (i.e. including case) a directory entry with a
+ *    file name in the WIN32 or POSIX namespaces. In this case
+ *    ntfs_lookup_inode_by_name() will return with name set to NULL and we
+ *    just d_splice_alias() @dent.
+ * 2) @dent matches (not including case) a directory entry with a file name in
+ *    the WIN32 namespace. In this case ntfs_lookup_inode_by_name() will return
+ *    with name set to point to a kmalloc()ed ntfs_name structure containing
+ *    the properly cased little endian Unicode name. We convert the name to the
+ *    current NLS code page, search if a dentry with this name already exists
+ *    and if so return that instead of @dent.  At this point things are
+ *    complicated by the possibility of 'disconnected' dentries due to NFS
+ *    which we deal with appropriately (see the code comments).  The VFS will
+ *    then destroy the old @dent and use the one we returned.  If a dentry is
+ *    not found, we allocate a new one, d_splice_alias() it, and return it as
+ *    above.
+ * 3) @dent matches either perfectly or not (i.e. we don't care about case) a
+ *    directory entry with a file name in the DOS namespace. In this case
+ *    ntfs_lookup_inode_by_name() will return with name set to point to a
+ *    kmalloc()ed ntfs_name structure containing the mft reference (cpu endian)
+ *    of the inode. We use the mft reference to read the inode and to find the
+ *    file name in the WIN32 namespace corresponding to the matched short file
+ *    name. We then convert the name to the current NLS code page, and proceed
+ *    searching for a dentry with this name, etc, as in case 2), above.
+ *
+ * Locking: Caller must hold i_sem on the directory.
+ */
+static struct dentry *ntfs_lookup(struct inode *dir_ino, struct dentry *dent,
+                struct nameidata *nd)
+{
+        ntfs_volume *vol = NTFS_SB(dir_ino->i_sb);
+        struct inode *dent_inode;
+        ntfschar *uname;
+        ntfs_name *name = NULL;
+        MFT_REF mref;
+        unsigned long dent_ino;
+        int uname_len;
+        ntfs_debug("Looking up %s in directory inode 0x%lx.",
+                        dent->d_name.name, dir_ino->i_ino);
+        /* Convert the name of the dentry to Unicode. */
+        uname_len = ntfs_nlstoucs(vol, dent->d_name.name, dent->d_name.len,
+                        &uname);
+        if (uname_len < 0) {
+                ntfs_error(vol->sb, "Failed to convert name to Unicode.");
+                return ERR_PTR(uname_len);
+        }
+        mref = ntfs_lookup_inode_by_name(NTFS_I(dir_ino), uname, uname_len,
+                        &name);
+        kmem_cache_free(ntfs_name_cache, uname);
+        if (!IS_ERR_MREF(mref)) {
+                dent_ino = MREF(mref);
+                ntfs_debug("Found inode 0x%lx. Calling ntfs_iget.", dent_ino);
+                dent_inode = ntfs_iget(vol->sb, dent_ino);
+                if (likely(!IS_ERR(dent_inode))) {
+                        /* Accept bad inodes, seq_no matches and $MFT. */
+                        if (is_bad_inode(dent_inode) || MSEQNO(mref) ==
+                                        NTFS_I(dent_inode)->seq_no ||
+                                        dent_ino == FILE_MFT) {
+                                /* Perfect WIN32/POSIX match. -- Case 1. */
+                                if (!name) {
+                                        ntfs_debug("Done.  (Case 1.)");
+                                        return d_splice_alias(dent_inode, dent);
+                                }
+                                /*
+                                 * We are too indented.  Handle imperfect
+                                 * matches and short file names further below.
+                                 */
+                                goto handle_name;
+                        }
+                        ntfs_error(vol->sb, "Found stale reference to inode "
+                                        "0x%lx (reference sequence number = "
+                                        "0x%x, inode sequence number = 0x%x), "
+                                        "returning -EIO. Run chkdsk.",
+                                        dent_ino, MSEQNO(mref),
+                                        NTFS_I(dent_inode)->seq_no);
+                        iput(dent_inode);
+                        dent_inode = ERR_PTR(-EIO);
+                } else
+                        ntfs_error(vol->sb, "ntfs_iget(0x%lx) failed with "
+                                        "error code %li.", dent_ino,
+                                        PTR_ERR(dent_inode));
+                if (name)
+                        kfree(name);
+                /* Return the error code. */
+                return (struct dentry *)dent_inode;
+        }
+        /* It is guaranteed that name is no longer allocated at this point. */
+        if (MREF_ERR(mref) == -ENOENT) {
+                ntfs_debug("Entry was not found, adding negative dentry.");
+                /* The dcache will handle negative entries. */
+                d_add(dent, NULL);
+                ntfs_debug("Done.");
+                return NULL;
+        }
+        ntfs_error(vol->sb, "ntfs_lookup_inode_by_name() failed with error "
+                        "code %i.", -MREF_ERR(mref));
+        return ERR_PTR(MREF_ERR(mref));
+        // TODO: Consider moving this lot to a separate function! (AIA)
+handle_name:
+   {
+        struct dentry *real_dent, *new_dent;
+        MFT_RECORD *m;
+        ntfs_attr_search_ctx *ctx;
+        ntfs_inode *ni = NTFS_I(dent_inode);
+        int err;
+        struct qstr nls_name;
+        nls_name.name = NULL;
+        if (name->type != FILE_NAME_DOS) {                      /* Case 2. */
+                ntfs_debug("Case 2.");
+                nls_name.len = (unsigned)ntfs_ucstonls(vol,
+                                (ntfschar*)&name->name, name->len,
+                                (unsigned char**)&nls_name.name, 0);
+                kfree(name);
+        } else /* if (name->type == FILE_NAME_DOS) */ {         /* Case 3. */
+                FILE_NAME_ATTR *fn;
+                ntfs_debug("Case 3.");
+                kfree(name);
+                /* Find the WIN32 name corresponding to the matched DOS name. */
+                ni = NTFS_I(dent_inode);
+                m = map_mft_record(ni);
+                if (IS_ERR(m)) {
+                        err = PTR_ERR(m);
+                        m = NULL;
+                        ctx = NULL;
+                        goto err_out;
+                }
+                ctx = ntfs_attr_get_search_ctx(ni, m);
+                if (unlikely(!ctx)) {
+                        err = -ENOMEM;
+                        goto err_out;
+                }
+                do {
+                        ATTR_RECORD *a;
+                        u32 val_len;
+                        err = ntfs_attr_lookup(AT_FILE_NAME, NULL, 0, 0, 0,
+                                        NULL, 0, ctx);
+                        if (unlikely(err)) {
+                                ntfs_error(vol->sb, "Inode corrupt: No WIN32 "
+                                                "namespace counterpart to DOS "
+                                                "file name. Run chkdsk.");
+                                if (err == -ENOENT)
+                                        err = -EIO;
+                                goto err_out;
+                        }
+                        /* Consistency checks; add in u64 to avoid wrap. */
+                        a = ctx->attr;
+                        if (a->non_resident || a->flags)
+                                goto eio_err_out;
+                        val_len = le32_to_cpu(a->data.resident.value_length);
+                        if ((u64)le16_to_cpu(a->data.resident.value_offset) +
+                                        val_len > le32_to_cpu(a->length))
+                                goto eio_err_out;
+                        fn = (FILE_NAME_ATTR*)((u8*)ctx->attr + le16_to_cpu(
+                                        ctx->attr->data.resident.value_offset));
+                        if ((u32)(fn->file_name_length * sizeof(ntfschar) +
+                                        sizeof(FILE_NAME_ATTR)) > val_len)
+                                goto eio_err_out;
+                } while (fn->file_name_type != FILE_NAME_WIN32);
+                /* Convert the found WIN32 name to current NLS code page. */
+                nls_name.len = (unsigned)ntfs_ucstonls(vol,
+                                (ntfschar*)&fn->file_name, fn->file_name_length,
+                                (unsigned char**)&nls_name.name, 0);
+                ntfs_attr_put_search_ctx(ctx);
+                unmap_mft_record(ni);
+        }
+        m = NULL;
+        ctx = NULL;
+        /* Check if a conversion error occurred. */
+        if ((signed)nls_name.len < 0) {
+                err = (signed)nls_name.len;
+                goto err_out;
+        }
+        nls_name.hash = full_name_hash(nls_name.name, nls_name.len);
+        /*
+         * Note: No need for dent->d_lock lock as i_sem is held on the
+         * parent inode.
+         */
+        /* Does a dentry matching the nls_name exist already? */
+        real_dent = d_lookup(dent->d_parent, &nls_name);
+        /* If not, create it now. */
+        if (!real_dent) {
+                real_dent = d_alloc(dent->d_parent, &nls_name);
+                kfree(nls_name.name);
+                if (!real_dent) {
+                        err = -ENOMEM;
+                        goto err_out;
+                }
+                new_dent = d_splice_alias(dent_inode, real_dent);
+                if (new_dent)
+                        dput(real_dent);
+                else
+                        new_dent = real_dent;
+                ntfs_debug("Done.  (Created new dentry.)");
+                return new_dent;
+        }
+        kfree(nls_name.name);
+        /* Matching dentry exists, check if it is negative. */
+        if (real_dent->d_inode) {
+                if (unlikely(real_dent->d_inode != dent_inode)) {
+                        /* This can happen because bad inodes are unhashed. */
+                        BUG_ON(!is_bad_inode(dent_inode));
+                        BUG_ON(!is_bad_inode(real_dent->d_inode));
+                }
+                /*
+                 * Already have the inode and the dentry attached, decrement
+                 * the reference count to balance the ntfs_iget() we did
+                 * earlier on.  We found the dentry using d_lookup() so it
+                 * cannot be disconnected and thus we do not need to worry
+                 * about any NFS/disconnectedness issues here.
+                 */
+                iput(dent_inode);
+                ntfs_debug("Done.  (Already had inode and dentry.)");
+                return real_dent;
+        }
+        /*
+         * Negative dentry: instantiate it unless the inode is a directory and
+         * has a 'disconnected' dentry (i.e. IS_ROOT and DCACHE_DISCONNECTED),
+         * in which case d_move() that in place of the found dentry.
+         */
+        if (!S_ISDIR(dent_inode->i_mode)) {
+                /* Not a directory; everything is easy. */
+                d_instantiate(real_dent, dent_inode);
+                ntfs_debug("Done.  (Already had negative file dentry.)");
+                return real_dent;
+        }
+        spin_lock(&dcache_lock);
+        if (list_empty(&dent_inode->i_dentry)) {
+                /*
+                 * Directory without a 'disconnected' dentry; we need to do
+                 * d_instantiate() by hand because it takes dcache_lock which
+                 * we already hold.
+                 */
+                list_add(&real_dent->d_alias, &dent_inode->i_dentry);
+                real_dent->d_inode = dent_inode;
+                spin_unlock(&dcache_lock);
+                security_d_instantiate(real_dent, dent_inode);
+                ntfs_debug("Done.  (Already had negative directory dentry.)");
+                return real_dent;
+        }
+        /*
+         * Directory with a 'disconnected' dentry; get a reference to the
+         * 'disconnected' dentry.
+         */
+        new_dent = list_entry(dent_inode->i_dentry.next, struct dentry,
+                        d_alias);
+        dget_locked(new_dent);
+        spin_unlock(&dcache_lock);
+        /* Do security voodoo. */
+        security_d_instantiate(real_dent, dent_inode);
+        /* Move new_dent in place of real_dent. */
+        d_move(new_dent, real_dent);
+        /* Balance the ntfs_iget() we did above. */
+        iput(dent_inode);
+        /* Throw away real_dent. */
+        dput(real_dent);
+        /* Use new_dent as the actual dentry. */
+        ntfs_debug("Done.  (Already had negative, disconnected directory "
+                        "dentry.)");
+        return new_dent;
+eio_err_out:
+        ntfs_error(vol->sb, "Illegal file name attribute. Run chkdsk.");
+        err = -EIO;
+err_out:
+        if (ctx)
+                ntfs_attr_put_search_ctx(ctx);
+        if (m)
+                unmap_mft_record(ni);
+        iput(dent_inode);
+        ntfs_error(vol->sb, "Failed, returning error code %i.", err);
+        return ERR_PTR(err);
+   }
+}
+/**
+ * Inode operations for directories; only ->lookup is provided here.
+ */
+struct inode_operations ntfs_dir_inode_ops = {
+        .lookup = ntfs_lookup,  /* VFS: Look up a name in a directory. */
+};
+/**
+ * ntfs_get_parent - find the dentry of the parent of a given directory dentry
+ * @child_dent:         dentry of the directory whose parent directory to find
+ *
+ * Find the dentry for the parent directory of the directory specified by the
+ * dentry @child_dent.  This function is called from
+ * fs/exportfs/expfs.c::find_exported_dentry() which in turn is called from the
+ * default ->decode_fh() which is export_decode_fh() in the same file.
+ *
+ * The code is based on the ext3 ->get_parent() implementation found in
+ * fs/ext3/namei.c::ext3_get_parent().
+ *
+ * Note: ntfs_get_parent() is called with @child_dent->d_inode->i_sem down.
+ *
+ * Return the parent directory dentry on success or an ERR_PTR() error code:
+ * -ENOMEM, -EACCES (parent inode unobtainable/bad), or a lower layer's error.
+ */
+struct dentry *ntfs_get_parent(struct dentry *child_dent)
+{
+        struct inode *vi = child_dent->d_inode;
+        ntfs_inode *ni = NTFS_I(vi);
+        MFT_RECORD *mrec;
+        ntfs_attr_search_ctx *ctx;
+        ATTR_RECORD *attr;
+        FILE_NAME_ATTR *fn;
+        struct inode *parent_vi;
+        struct dentry *parent_dent;
+        unsigned long parent_ino;
+        int err;
+        ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
+        /* Get the mft record of the inode belonging to the child dentry. */
+        mrec = map_mft_record(ni);
+        if (IS_ERR(mrec))
+                return (struct dentry *)mrec; /* Pass the error on. */
+        /* Find the first file name attribute in the mft record. */
+        ctx = ntfs_attr_get_search_ctx(ni, mrec);
+        if (unlikely(!ctx)) {
+                unmap_mft_record(ni);
+                return ERR_PTR(-ENOMEM);
+        }
+try_next:
+        err = ntfs_attr_lookup(AT_FILE_NAME, NULL, 0, CASE_SENSITIVE, 0, NULL,
+                        0, ctx);
+        if (unlikely(err)) {
+                ntfs_attr_put_search_ctx(ctx);
+                unmap_mft_record(ni);
+                if (err == -ENOENT)
+                        ntfs_error(vi->i_sb, "Inode 0x%lx does not have a "
+                                        "file name attribute.  Run chkdsk.",
+                                        vi->i_ino);
+                return ERR_PTR(err);
+        }
+        attr = ctx->attr;
+        if (unlikely(attr->non_resident))
+                goto try_next; /* Only resident attributes are used. */
+        fn = (FILE_NAME_ATTR *)((u8 *)attr +
+                        le16_to_cpu(attr->data.resident.value_offset));
+        if (unlikely((u8 *)fn + le32_to_cpu(attr->data.resident.value_length) >
+                        (u8*)attr + le32_to_cpu(attr->length)))
+                goto try_next; /* Value overruns the attribute. */
+        /* Get the inode number of the parent directory. */
+        parent_ino = MREF_LE(fn->parent_directory);
+        /* Release the search context and the mft record of the child. */
+        ntfs_attr_put_search_ctx(ctx);
+        unmap_mft_record(ni);
+        /* Get the inode of the parent directory. */
+        parent_vi = ntfs_iget(vi->i_sb, parent_ino);
+        if (IS_ERR(parent_vi) || unlikely(is_bad_inode(parent_vi))) {
+                if (!IS_ERR(parent_vi))
+                        iput(parent_vi); /* Drop the bad inode. */
+                ntfs_error(vi->i_sb, "Failed to get parent directory inode "
+                                "0x%lx of child inode 0x%lx.", parent_ino,
+                                vi->i_ino);
+                return ERR_PTR(-EACCES);
+        }
+        /* Finally get a dentry for the parent directory and return it. */
+        parent_dent = d_alloc_anon(parent_vi);
+        if (unlikely(!parent_dent)) {
+                iput(parent_vi);
+                return ERR_PTR(-ENOMEM);
+        }
+        ntfs_debug("Done for inode 0x%lx.", vi->i_ino);
+        return parent_dent;
+}
+/**
+ * ntfs_get_dentry - find a dentry for the inode from a file handle sub-fragment
+ * @sb:         super block identifying the mounted ntfs volume
+ * @fh:         the file handle sub-fragment
+ *
+ * Find a dentry for the inode given a file handle sub-fragment.  This function
+ * is called from fs/exportfs/expfs.c::find_exported_dentry() which in turn is
+ * called from the default ->decode_fh() which is export_decode_fh() in the
+ * same file.  The code is closely based on the default ->get_dentry() helper
+ * fs/exportfs/expfs.c::get_object().
+ *
+ * The @fh contains two 32-bit unsigned values, the first one is the inode
+ * number and the second one is the inode generation.
+ *
+ * Return the dentry on success or an ERR_PTR() error code (IS_ERR() is true).
+ */
+struct dentry *ntfs_get_dentry(struct super_block *sb, void *fh)
+{
+        struct inode *vi;
+        struct dentry *dent;
+        unsigned long ino = ((u32 *)fh)[0]; /* First u32: inode number. */
+        u32 gen = ((u32 *)fh)[1]; /* Second u32: inode generation. */
+        ntfs_debug("Entering for inode 0x%lx, generation 0x%x.", ino, gen);
+        vi = ntfs_iget(sb, ino);
+        if (IS_ERR(vi)) {
+                ntfs_error(sb, "Failed to get inode 0x%lx.", ino);
+                return (struct dentry *)vi; /* Pass the error on. */
+        }
+        if (unlikely(is_bad_inode(vi) || vi->i_generation != gen)) {
+                /* Bad inode or generation mismatch: the handle is stale. */
+                ntfs_error(sb, "Inode 0x%lx, bad count: %d %d or version 0x%x "
+                                "0x%x.", vi->i_ino, vi->i_nlink,
+                                atomic_read(&vi->i_count), vi->i_generation,
+                                gen);
+                iput(vi);
+                return ERR_PTR(-ESTALE);
+        }
+        /* Now find a dentry.  If possible, get a well-connected one. */
+        dent = d_alloc_anon(vi);
+        if (unlikely(!dent)) {
+                iput(vi); /* Drop the reference taken by ntfs_iget(). */
+                return ERR_PTR(-ENOMEM);
+        }
+        ntfs_debug("Done for inode 0x%lx, generation 0x%x.", ino, gen);
+        return dent;
+}
diff --git a/fs/ntfs/ntfs.h b/fs/ntfs/ntfs.h
new file mode 100644
index 000000000000..720ffb71bab8
--- /dev/null
+++ b/fs/ntfs/ntfs.h
@@ -0,0 +1,129 @@
+/*
+ * ntfs.h - Defines for NTFS Linux kernel driver. Part of the Linux-NTFS
+ *          project.
+ *
+ * Copyright (c) 2001-2004 Anton Altaparmakov
+ * Copyright (C) 2002 Richard Russon
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#ifndef _LINUX_NTFS_H
+#define _LINUX_NTFS_H
+#include <linux/stddef.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/compiler.h>
+#include <linux/fs.h>
+#include <linux/nls.h>
+#include <linux/smp.h>
+#include "types.h"
+#include "volume.h"
+#include "layout.h"
+typedef enum { /* Numeric constants used by the driver. */
+        NTFS_BLOCK_SIZE         = 512,  /* Bytes; stride of the mst fixups. */
+        NTFS_BLOCK_SIZE_BITS    = 9,    /* log2(NTFS_BLOCK_SIZE). */
+        NTFS_SB_MAGIC           = 0x5346544e,   /* 'NTFS' */
+        NTFS_MAX_NAME_LEN       = 255,
+} NTFS_CONSTANTS;
+/* Global variables shared between the source files of the driver. */
+/* Slab caches (from super.c). */
+extern kmem_cache_t *ntfs_name_cache;
+extern kmem_cache_t *ntfs_inode_cache;
+extern kmem_cache_t *ntfs_big_inode_cache;
+extern kmem_cache_t *ntfs_attr_ctx_cache;
+extern kmem_cache_t *ntfs_index_ctx_cache;
+/* The various operations structs defined throughout the driver files. */
+extern struct address_space_operations ntfs_aops;
+extern struct address_space_operations ntfs_mst_aops;
+extern struct  file_operations ntfs_file_ops;
+extern struct inode_operations ntfs_file_inode_ops;
+extern struct  file_operations ntfs_dir_ops;
+extern struct inode_operations ntfs_dir_inode_ops; /* From namei.c. */
+extern struct  file_operations ntfs_empty_file_ops;
+extern struct inode_operations ntfs_empty_inode_ops;
+/**
+ * NTFS_SB - return the ntfs volume given a vfs super block
+ * @sb:         VFS super block
+ *
+ * Return the ntfs volume of the VFS super block @sb, i.e. @sb->s_fs_info.
+ */
+static inline ntfs_volume *NTFS_SB(struct super_block *sb)
+{
+        return sb->s_fs_info;
+}
+/* Declarations of functions and global variables. */
+/* From fs/ntfs/compress.c */
+extern int ntfs_read_compressed_block(struct page *page);
+extern int allocate_compression_buffers(void);
+extern void free_compression_buffers(void);
+/* From fs/ntfs/super.c */
+#define default_upcase_len 0x10000
+extern struct semaphore ntfs_lock;
+typedef struct {
+        int val;
+        char *str;
+} option_t;
+extern const option_t on_errors_arr[];
+/* From fs/ntfs/mst.c: multi sector transfer (mst) protection fixups. */
+extern int post_read_mst_fixup(NTFS_RECORD *b, const u32 size);
+extern int pre_write_mst_fixup(NTFS_RECORD *b, const u32 size);
+extern void post_write_mst_fixup(NTFS_RECORD *b);
+/* From fs/ntfs/unistr.c */
+extern BOOL ntfs_are_names_equal(const ntfschar *s1, size_t s1_len,
+                const ntfschar *s2, size_t s2_len,
+                const IGNORE_CASE_BOOL ic,
+                const ntfschar *upcase, const u32 upcase_size);
+extern int ntfs_collate_names(const ntfschar *name1, const u32 name1_len,
+                const ntfschar *name2, const u32 name2_len,
+                const int err_val, const IGNORE_CASE_BOOL ic,
+                const ntfschar *upcase, const u32 upcase_len);
+extern int ntfs_ucsncmp(const ntfschar *s1, const ntfschar *s2, size_t n);
+extern int ntfs_ucsncasecmp(const ntfschar *s1, const ntfschar *s2, size_t n,
+                const ntfschar *upcase, const u32 upcase_size);
+extern void ntfs_upcase_name(ntfschar *name, u32 name_len,
+                const ntfschar *upcase, const u32 upcase_len);
+extern void ntfs_file_upcase_value(FILE_NAME_ATTR *file_name_attr,
+                const ntfschar *upcase, const u32 upcase_len);
+extern int ntfs_file_compare_values(FILE_NAME_ATTR *file_name_attr1,
+                FILE_NAME_ATTR *file_name_attr2,
+                const int err_val, const IGNORE_CASE_BOOL ic,
+                const ntfschar *upcase, const u32 upcase_len);
+extern int ntfs_nlstoucs(const ntfs_volume *vol, const char *ins,
+                const int ins_len, ntfschar **outs);
+extern int ntfs_ucstonls(const ntfs_volume *vol, const ntfschar *ins,
+                const int ins_len, unsigned char **outs, int outs_len);
+/* From fs/ntfs/upcase.c */
+extern ntfschar *generate_default_upcase(void);
+#endif /* _LINUX_NTFS_H */
diff --git a/fs/ntfs/quota.c b/fs/ntfs/quota.c
new file mode 100644
index 000000000000..833df2a4e9fb
--- /dev/null
+++ b/fs/ntfs/quota.c
@@ -0,0 +1,117 @@
+/*
+ * quota.c - NTFS kernel quota ($Quota) handling.  Part of the Linux-NTFS
+ *           project.
+ *
+ * Copyright (c) 2004 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#ifdef NTFS_RW
+#include "index.h"
+#include "quota.h"
+#include "debug.h"
+#include "ntfs.h"
+/**
+ * ntfs_mark_quotas_out_of_date - mark the quotas out of date on an ntfs volume
+ * @vol:        ntfs volume on which to mark the quotas out of date
+ *
+ * Mark the quotas out of date on @vol.  Return TRUE on success (including when
+ * no change was needed) and FALSE on error, in which case an error is logged.
+ */
+BOOL ntfs_mark_quotas_out_of_date(ntfs_volume *vol)
+{
+        ntfs_index_context *ictx;
+        QUOTA_CONTROL_ENTRY *qce;
+        const le32 qid = QUOTA_DEFAULTS_ID; /* Key of the defaults entry. */
+        int err;
+        ntfs_debug("Entering.");
+        if (NVolQuotaOutOfDate(vol))
+                goto done; /* Volume flag is already set. */
+        if (!vol->quota_ino || !vol->quota_q_ino) {
+                ntfs_error(vol->sb, "Quota inodes are not open.");
+                return FALSE;
+        }
+        down(&vol->quota_q_ino->i_sem); /* Released on every exit below. */
+        ictx = ntfs_index_ctx_get(NTFS_I(vol->quota_q_ino));
+        if (!ictx) {
+                ntfs_error(vol->sb, "Failed to get index context.");
+                goto err_out;
+        }
+        err = ntfs_index_lookup(&qid, sizeof(qid), ictx);
+        if (err) {
+                if (err == -ENOENT)
+                        ntfs_error(vol->sb, "Quota defaults entry is not "
+                                        "present.");
+                else
+                        ntfs_error(vol->sb, "Lookup of quota defaults entry "
+                                        "failed.");
+                goto err_out;
+        }
+        if (ictx->data_len < offsetof(QUOTA_CONTROL_ENTRY, sid)) {
+                ntfs_error(vol->sb, "Quota defaults entry size is invalid.  "
+                                "Run chkdsk.");
+                goto err_out;
+        }
+        qce = (QUOTA_CONTROL_ENTRY*)ictx->data;
+        if (le32_to_cpu(qce->version) != QUOTA_VERSION) {
+                ntfs_error(vol->sb, "Quota defaults entry version 0x%x is not "
+                                "supported.", le32_to_cpu(qce->version));
+                goto err_out;
+        }
+        ntfs_debug("Quota defaults flags = 0x%x.", le32_to_cpu(qce->flags));
+        /* If quotas are already marked out of date, no need to do anything. */
+        if (qce->flags & QUOTA_FLAG_OUT_OF_DATE)
+                goto set_done;
+        /*
+         * If quota tracking is neither requested, nor enabled and there are no
+         * pending deletes, no need to mark the quotas out of date.
+         */
+        if (!(qce->flags & (QUOTA_FLAG_TRACKING_ENABLED |
+                        QUOTA_FLAG_TRACKING_REQUESTED |
+                        QUOTA_FLAG_PENDING_DELETES)))
+                goto set_done;
+        /*
+         * Set the QUOTA_FLAG_OUT_OF_DATE bit thus marking quotas out of date.
+         * This is verified on WinXP to be sufficient to cause windows to
+         * rescan the volume on boot and update all quota entries.
+         */
+        qce->flags |= QUOTA_FLAG_OUT_OF_DATE;
+        /* Ensure the modified flags are written to disk. */
+        ntfs_index_entry_flush_dcache_page(ictx);
+        ntfs_index_entry_mark_dirty(ictx);
+set_done:
+        ntfs_index_ctx_put(ictx);
+        up(&vol->quota_q_ino->i_sem);
+        /*
+         * We set the flag so we do not try to mark the quotas out of date
+         * again on remount.
+         */
+        NVolSetQuotaOutOfDate(vol);
+done:
+        ntfs_debug("Done.");
+        return TRUE;
+err_out:
+        if (ictx) /* NULL if ntfs_index_ctx_get() failed. */
+                ntfs_index_ctx_put(ictx);
+        up(&vol->quota_q_ino->i_sem);
+        return FALSE;
+}
+#endif /* NTFS_RW */
diff --git a/fs/ntfs/quota.h b/fs/ntfs/quota.h
new file mode 100644
index 000000000000..40e4763aa222
--- /dev/null
+++ b/fs/ntfs/quota.h
@@ -0,0 +1,35 @@
+/*
+ * quota.h - Defines for NTFS kernel quota ($Quota) handling.  Part of the
+ *           Linux-NTFS project.
+ *
+ * Copyright (c) 2004 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#ifndef _LINUX_NTFS_QUOTA_H
+#define _LINUX_NTFS_QUOTA_H
+#ifdef NTFS_RW
+#include "types.h"
+#include "volume.h"
+extern BOOL ntfs_mark_quotas_out_of_date(ntfs_volume *vol);
+#endif /* NTFS_RW */
+#endif /* _LINUX_NTFS_QUOTA_H */
diff --git a/fs/ntfs/runlist.c b/fs/ntfs/runlist.c
new file mode 100644
index 000000000000..8438fb1da219
--- /dev/null
+++ b/fs/ntfs/runlist.c
@@ -0,0 +1,1438 @@
+/**
+ * runlist.c - NTFS runlist handling code.  Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2004 Anton Altaparmakov
+ * Copyright (c) 2002 Richard Russon
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#include "debug.h"
+#include "dir.h"
+#include "endian.h"
+#include "malloc.h"
+#include "ntfs.h"
+/**
+ * ntfs_rl_mm - move @size runlist elements in @base from @src to @dst
+ *
+ * The caller must serialize access to the runlist @base.
+ */
+static inline void ntfs_rl_mm(runlist_element *base, int dst, int src,
+                int size)
+{
+        if (likely(size > 0 && src != dst))
+                memmove(&base[dst], &base[src], sizeof(base[0]) * size);
+}
+/**
+ * ntfs_rl_mc - copy @size runlist elements from @srcbase into @dstbase
+ *
+ * The caller must serialize access to the runlists @dstbase and @srcbase.
+ */
+static inline void ntfs_rl_mc(runlist_element *dstbase, int dst,
+                runlist_element *srcbase, int src, int size)
+{
+        if (unlikely(size <= 0))
+                return;
+        memcpy(&dstbase[dst], &srcbase[src], sizeof(dstbase[0]) * size);
+}
+/**
+ * ntfs_rl_realloc - Reallocate memory for runlists
+ * @rl:         original runlist
+ * @old_size:   number of runlist elements in the original runlist @rl
+ * @new_size:   number of runlist elements we need space for
+ *
+ * Runlist buffers are sized in whole pages so that a growing runlist does not
+ * force the kernel to allocate and reallocate lots of small chunks of memory.
+ * Both @old_size and @new_size are therefore rounded up to a multiple of the
+ * page size before they are compared.
+ *
+ * It is up to the caller to serialize access to the runlist @rl.
+ *
+ * N.B.  If the rounded sizes are equal, no allocation takes place and the
+ *       original pointer @rl is returned.
+ *
+ * On success, return a pointer to the newly allocated, or recycled, memory.
+ * A new buffer receives the old contents and the old buffer @rl is freed.
+ *
+ * On error, return ERR_PTR(-ENOMEM) and leave @rl untouched.
+ */
+static inline runlist_element *ntfs_rl_realloc(runlist_element *rl,
+                int old_size, int new_size)
+{
+        runlist_element *new_rl;
+        int old_bytes = PAGE_ALIGN(old_size * sizeof(*rl));
+        int new_bytes = PAGE_ALIGN(new_size * sizeof(*rl));
+        if (new_bytes == old_bytes)
+                return rl;
+        new_rl = ntfs_malloc_nofs(new_bytes);
+        if (unlikely(!new_rl))
+                return ERR_PTR(-ENOMEM);
+        if (unlikely(!rl))
+                return new_rl;
+        /* Keep as much of the old contents as fits, then drop the old one. */
+        memcpy(new_rl, rl, old_bytes < new_bytes ? old_bytes : new_bytes);
+        ntfs_free(rl);
+        return new_rl;
+}
+/**
+ * ntfs_are_rl_mergeable - test if two runlists can be joined together
+ * @dst:        original runlist
+ * @src:        new runlist to test for mergeability with @dst
+ *
+ * Two runs can be joined if neither has a negative LCN and @src starts, in
+ * both VCN and LCN terms, exactly where @dst ends.
+ *
+ * It is up to the caller to serialize access to the runlists @dst and @src.
+ *
+ * Return TRUE if the runlists can be merged and FALSE if they cannot.
+ */
+static inline BOOL ntfs_are_rl_mergeable(runlist_element *dst,
+                runlist_element *src)
+{
+        BUG_ON(!dst);
+        BUG_ON(!src);
+        /* Holes and other special (negative) LCNs are never merged. */
+        if (dst->lcn < 0 || src->lcn < 0)
+                return FALSE;
+        /* @src must begin where @dst ends, both on disk and in the file. */
+        if (src->lcn == dst->lcn + dst->length &&
+                        src->vcn == dst->vcn + dst->length)
+                return TRUE;
+        return FALSE;
+}
+/**
+ * __ntfs_rl_merge - merge two runlists without testing if they can be merged
+ * @dst:        original, destination runlist
+ * @src:        new runlist to merge with @dst
+ *
+ * Extend the run @dst by the length of the run @src.  No checks are made, so
+ * the caller must have established that the two runs are mergeable or the
+ * destination runlist will be corrupted.
+ *
+ * It is up to the caller to serialize access to the runlists @dst and @src.
+ */
+static inline void __ntfs_rl_merge(runlist_element *dst, runlist_element *src)
+{
+        dst->length = dst->length + src->length;
+}
+/**
+ * ntfs_rl_append - append a runlist after a given element
+ * @dst:        original runlist to be worked on
+ * @dsize:      number of elements in @dst (including end marker)
+ * @src:        runlist to be inserted into @dst
+ * @ssize:      number of elements in @src (excluding end marker)
+ * @loc:        append the new runlist @src after this element in @dst
+ *
+ * Append the runlist @src after element @loc in @dst.  Merge the right end of
+ * the new runlist, if necessary. Adjust the size of the hole before the
+ * appended runlist.
+ *
+ * It is up to the caller to serialize access to the runlists @dst and @src.
+ *
+ * On success, return a pointer to the new, combined, runlist.  The buffer of
+ * @dst may have been reallocated and freed, so the old @dst pointer must not
+ * be used any more.  (Strictly speaking the returned runlist may be the same
+ * as @dst but this is irrelevant.)
+ *
+ * On error, return -errno. Both runlists are left unmodified. The following
+ * error codes are defined:
+ *      -ENOMEM - Not enough memory to allocate runlist array.
+ */
+static inline runlist_element *ntfs_rl_append(runlist_element *dst,
+                int dsize, runlist_element *src, int ssize, int loc)
+{
+        BOOL right = FALSE;
+        int magic;
+        BUG_ON(!dst);
+        BUG_ON(!src);
+        /* Check if the right hand end needs merging.  If @loc is the last
+         * element of @dst there is no run after it to look at or merge. */
+        if ((loc + 1) < dsize)
+                right = ntfs_are_rl_mergeable(src + ssize - 1, dst + loc + 1);
+        /* Space required: @dst size + @src size, less one if we merged. */
+        dst = ntfs_rl_realloc(dst, dsize, dsize + ssize - right);
+        if (IS_ERR(dst))
+                return dst;
+        /* We are guaranteed to succeed from here so can start modifying the
+         * original runlists. */
+        /* First, merge the right hand end, if necessary. */
+        if (right)
+                __ntfs_rl_merge(src + ssize - 1, dst + loc + 1);
+        /* Index of the last element of @src once it is copied into @dst. */
+        magic = loc + ssize;
+        /* Move the tail of @dst out of the way, then copy in @src. */
+        ntfs_rl_mm(dst, magic + 1, loc + 1 + right, dsize - loc - 1 - right);
+        ntfs_rl_mc(dst, loc + 1, src, 0, ssize);
+        /* Adjust the size of the preceding hole. */
+        dst[loc].length = dst[loc + 1].vcn - dst[loc].vcn;
+        /* We may have changed the length of the file, so fix the end marker */
+        if (dst[magic + 1].lcn == LCN_ENOENT)
+                dst[magic + 1].vcn = dst[magic].vcn + dst[magic].length;
+        return dst;
+}
+/**
+ * ntfs_rl_insert - insert a runlist into another
+ * @dst:        original runlist to be worked on
+ * @dsize:      number of elements in @dst (including end marker)
+ * @src:        new runlist to be inserted
+ * @ssize:      number of elements in @src (excluding end marker)
+ * @loc:        insert the new runlist @src before this element in @dst
+ *
+ * Insert the runlist @src before element @loc in the runlist @dst. Merge the
+ * left end of the new runlist, if necessary. Adjust the size of the hole
+ * after the inserted runlist.
+ *
+ * It is up to the caller to serialize access to the runlists @dst and @src.
+ *
+ * On success, return a pointer to the new, combined, runlist.  The buffer of
+ * @dst may have been reallocated and freed by ntfs_rl_realloc(), so the old
+ * @dst pointer must not be used any more.  (Strictly speaking the returned
+ * runlist may be the same as @dst but this is irrelevant.)
+ *
+ * On error, return the error pointer obtained from ntfs_rl_realloc(), i.e.
+ * ERR_PTR(-ENOMEM).  Both runlists are left unmodified in that case.
+ */
+static inline runlist_element *ntfs_rl_insert(runlist_element *dst,
+                int dsize, runlist_element *src, int ssize, int loc)
+{
+        BOOL left = FALSE;
+        BOOL disc = FALSE;      /* Discontinuity */
+        BOOL hole = FALSE;      /* Following a hole */
+        int magic;
+        BUG_ON(!dst);
+        BUG_ON(!src);
+        /* disc => Discontinuity between the end of @dst and the start of @src.
+         *         This means we might need to insert a hole.
+         * hole => @dst ends with a hole or an unmapped region which we can
+         *         extend to match the discontinuity. */
+        if (loc == 0)
+                disc = (src[0].vcn > 0);
+        else {
+                s64 merged_length;
+                left = ntfs_are_rl_mergeable(dst + loc - 1, src);
+                merged_length = dst[loc - 1].length;
+                if (left)
+                        merged_length += src->length;
+                disc = (src[0].vcn > dst[loc - 1].vcn + merged_length);
+                if (disc)
+                        hole = (dst[loc - 1].lcn == LCN_HOLE);
+        }
+        /* Space required: @dst size + @src size, less one if we merged, plus
+         * one if there was a discontinuity, less one for a trailing hole. */
+        dst = ntfs_rl_realloc(dst, dsize, dsize + ssize - left + disc - hole);
+        if (IS_ERR(dst))
+                return dst;
+        /*
+         * We are guaranteed to succeed from here so can start modifying the
+         * original runlist.
+         */
+        /* Merge the first run of @src into the run before @loc. */
+        if (left)
+                __ntfs_rl_merge(dst + loc - 1, src);
+        /* Index at which the tail of @dst, currently at @loc, will end up. */
+        magic = loc + ssize - left + disc - hole;
+        /* Move the tail of @dst out of the way, then copy in @src. */
+        ntfs_rl_mm(dst, magic, loc, dsize - loc);
+        ntfs_rl_mc(dst, loc + disc - hole, src, left, ssize - left);
+        /* Adjust the VCN of the last run ... */
+        if (dst[magic].lcn <= LCN_HOLE)
+                dst[magic].vcn = dst[magic - 1].vcn + dst[magic - 1].length;
+        /* ... and the length. */
+        if (dst[magic].lcn == LCN_HOLE || dst[magic].lcn == LCN_RL_NOT_MAPPED)
+                dst[magic].length = dst[magic + 1].vcn - dst[magic].vcn;
+        /* Writing beyond the end of the file and there's a discontinuity. */
+        if (disc) {
+                if (hole)
+                        dst[loc - 1].length = dst[loc].vcn - dst[loc - 1].vcn;
+                else {
+                        if (loc > 0) {
+                                dst[loc].vcn = dst[loc - 1].vcn +
+                                                dst[loc - 1].length;
+                                dst[loc].length = dst[loc + 1].vcn -
+                                                dst[loc].vcn;
+                        } else {
+                                dst[loc].vcn = 0;
+                                dst[loc].length = dst[loc + 1].vcn;
+                        }
+                        dst[loc].lcn = LCN_RL_NOT_MAPPED;
+                }
+                magic += hole;
+                if (dst[magic].lcn == LCN_ENOENT)
+                        dst[magic].vcn = dst[magic - 1].vcn +
+                                        dst[magic - 1].length;
+        }
+        return dst;
+}
+/**
+ * ntfs_rl_replace - overwrite a runlist element with another runlist
+ * @dst:        original runlist to be worked on
+ * @dsize:      number of elements in @dst (including end marker)
+ * @src:        new runlist to be inserted
+ * @ssize:      number of elements in @src (excluding end marker)
+ * @loc:        index in runlist @dst to overwrite with @src
+ *
+ * Replace the runlist element @dst at @loc with @src. Merge the left and
+ * right ends of the inserted runlist, if necessary.
+ *
+ * It is up to the caller to serialize access to the runlists @dst and @src.
+ *
+ * On success, return a pointer to the new, combined, runlist.  The buffer of
+ * @dst may have been reallocated and freed, so the old @dst pointer must not
+ * be used any more.  (Strictly speaking the returned runlist may be the same
+ * as @dst but this is irrelevant.)
+ *
+ * On error, return -errno. Both runlists are left unmodified. The following
+ * error codes are defined:
+ *      -ENOMEM - Not enough memory to allocate runlist array.
+ */
+static inline runlist_element *ntfs_rl_replace(runlist_element *dst,
+                int dsize, runlist_element *src, int ssize, int loc)
+{
+        BOOL left = FALSE;
+        BOOL right = FALSE;
+        int magic;
+        BUG_ON(!dst);
+        BUG_ON(!src);
+        /* First, see if the left and right ends need merging.  If @loc is the
+         * last element of @dst there is no right hand neighbour to look at. */
+        if ((loc + 1) < dsize)
+                right = ntfs_are_rl_mergeable(src + ssize - 1, dst + loc + 1);
+        if (loc > 0)
+                left = ntfs_are_rl_mergeable(dst + loc - 1, src);
+        /* Space required: less one for each of the ends that was merged. */
+        dst = ntfs_rl_realloc(dst, dsize, dsize + ssize - left - right);
+        if (IS_ERR(dst))
+                return dst;
+        /*
+         * We are guaranteed to succeed from here so can start modifying the
+         * original runlists.
+         */
+        if (right)
+                __ntfs_rl_merge(src + ssize - 1, dst + loc + 1);
+        if (left)
+                __ntfs_rl_merge(dst + loc - 1, src);
+        /* Index in @dst of the first element following the copied-in @src. */
+        magic = loc + ssize - left;
+        /* Move the tail of @dst out of the way, then copy in @src. */
+        ntfs_rl_mm(dst, magic, loc + right + 1, dsize - loc - right - 1);
+        ntfs_rl_mc(dst, loc, src, left, ssize - left);
+        /* We may have changed the length of the file, so fix the end marker */
+        if (dst[magic].lcn == LCN_ENOENT)
+                dst[magic].vcn = dst[magic - 1].vcn + dst[magic - 1].length;
+        return dst;
+}
+/**
+ * ntfs_rl_split - insert a runlist into the centre of a hole
+ * @dst:        original runlist to be worked on
+ * @dsize:      number of elements in @dst (including end marker)
+ * @src:        new runlist to be inserted
+ * @ssize:      number of elements in @src (excluding end marker)
+ * @loc:        index in runlist @dst at which to split and insert @src
+ *
+ * Split the runlist @dst at @loc into two and insert @src in between the two
+ * fragments. No merging of runlists is necessary. Adjust the size of the
+ * holes either side.
+ *
+ * It is up to the caller to serialize access to the runlists @dst and @src.
+ *
+ * On success, return a pointer to the new, combined, runlist.  The buffer of
+ * @dst may have been reallocated and freed, so the old @dst pointer must not
+ * be used any more.  (Strictly speaking the returned runlist may be the same
+ * as @dst but this is irrelevant.)
+ *
+ * On error, return -errno. Both runlists are left unmodified. The following
+ * error codes are defined:
+ *      -ENOMEM - Not enough memory to allocate runlist array.
+ */
+static inline runlist_element *ntfs_rl_split(runlist_element *dst, int dsize,
+                runlist_element *src, int ssize, int loc)
+{
+        runlist_element *before, *after;
+        BUG_ON(!dst);
+        BUG_ON(!src);
+        /* One extra element is needed for the second fragment of @dst[@loc]. */
+        dst = ntfs_rl_realloc(dst, dsize, dsize + ssize + 1);
+        if (IS_ERR(dst))
+                return dst;
+        /* Nothing can fail from here on, so @dst may now be modified. */
+        before = dst + loc;
+        after = before + ssize + 1;
+        /* Move the tail of @dst out of the way, then copy in @src. */
+        ntfs_rl_mm(dst, loc + 1 + ssize, loc, dsize - loc);
+        ntfs_rl_mc(dst, loc + 1, src, 0, ssize);
+        /* The first fragment now ends where @src begins ... */
+        before->length = before[1].vcn - before->vcn;
+        /* ... and the second one starts where @src ends. */
+        after->vcn = after[-1].vcn + after[-1].length;
+        after->length = after[1].vcn - after->vcn;
+        return dst;
+}
+/**
+ * ntfs_runlists_merge - merge two runlists into one
+ * @drl:        original runlist to be worked on
+ * @srl:        new runlist to be merged into @drl
+ *
+ * First we sanity check the two runlists @srl and @drl to make sure that they
+ * are sensible and can be merged. The runlist @srl must be either after the
+ * runlist @drl or completely within a hole (or unmapped region) in @drl.
+ *
+ * It is up to the caller to serialize access to the runlists @drl and @srl.
+ *
+ * Merging of runlists is necessary in two cases:
+ *   1. When attribute lists are used and a further extent is being mapped.
+ *   2. When new clusters are allocated to fill a hole or extend a file.
+ *
+ * There are four possible ways @srl can be merged. It can:
+ *      - be inserted at the beginning of a hole,
+ *      - split the hole in two and be inserted between the two fragments,
+ *      - be appended at the end of a hole, or it can
+ *      - replace the whole hole.
+ * It can also be appended to the end of the runlist, which is just a variant
+ * of the insert case.
+ *
+ * On success, return a pointer to the new, combined, runlist. Note, both
+ * runlists @drl and @srl are deallocated before returning so you cannot use
+ * the pointers for anything any more. (Strictly speaking the returned runlist
+ * may be the same as @drl but this is irrelevant.)
+ *
+ * On error, return -errno. Both runlists are left unmodified. The following
+ * error codes are defined:
+ *      -ENOMEM - Not enough memory to allocate runlist array.
+ *      -EINVAL - Invalid parameters were passed in.
+ *      -ERANGE - The runlists overlap and cannot be merged.
+ */
+runlist_element *ntfs_runlists_merge(runlist_element *drl,
+                runlist_element *srl)
+{
+        int di, si;             /* Current index into @[ds]rl. */
+        int sstart;             /* First index with lcn > LCN_RL_NOT_MAPPED. */
+        int dins;               /* Index into @drl at which to insert @srl. */
+        int dend, send;         /* Last index into @[ds]rl. */
+        int dfinal, sfinal;     /* The last index into @[ds]rl with
+                                   lcn >= LCN_HOLE. */
+        int marker = 0;
+        VCN marker_vcn = 0;
+#ifdef DEBUG
+        ntfs_debug("dst:");
+        ntfs_debug_dump_runlist(drl);
+        ntfs_debug("src:");
+        ntfs_debug_dump_runlist(srl);
+#endif
+        /* Check for silly calling... */
+        if (unlikely(!srl))
+                return drl;
+        if (IS_ERR(srl) || IS_ERR(drl))
+                return ERR_PTR(-EINVAL);
+        /* Check for the case where the first mapping is being done now. */
+        if (unlikely(!drl)) {
+                drl = srl;
+                /* Complete the source runlist if necessary. */
+                if (unlikely(drl[0].vcn)) {
+                        /* Find the terminator: @dend + 1 elements in all. */
+                        for (dend = 0; likely(drl[dend].length); dend++)
+                                ;
+                        drl = ntfs_rl_realloc(drl, dend + 1, dend + 2);
+                        if (IS_ERR(drl))
+                                return drl;
+                        /* Insert start element at the front of the runlist. */
+                        ntfs_rl_mm(drl, 1, 0, dend + 1);
+                        drl[0].vcn = 0;
+                        drl[0].lcn = LCN_RL_NOT_MAPPED;
+                        drl[0].length = drl[1].vcn;
+                }
+                goto finished;
+        }
+        si = di = 0;
+        /* Skip any unmapped start element(s) in the source runlist. */
+        while (srl[si].length && srl[si].lcn < LCN_HOLE)
+                si++;
+        /* Can't have an entirely unmapped source runlist. */
+        BUG_ON(!srl[si].length);
+        /* Record the starting points. */
+        sstart = si;
+        /*
+         * Skip forward in @drl until we reach the position where @srl needs to
+         * be inserted. If we reach the end of @drl, @srl just needs to be
+         * appended to @drl.
+         */
+        for (; drl[di].length; di++) {
+                if (drl[di].vcn + drl[di].length > srl[sstart].vcn)
+                        break;
+        }
+        dins = di;
+        /* Sanity check for illegal overlaps. */
+        if ((drl[di].vcn == srl[si].vcn) && (drl[di].lcn >= 0) &&
+                        (srl[si].lcn >= 0)) {
+                ntfs_error(NULL, "Run lists overlap. Cannot merge!");
+                return ERR_PTR(-ERANGE);
+        }
+        /* Scan to the end of both runlists in order to know their sizes. */
+        for (send = si; srl[send].length; send++)
+                ;
+        for (dend = di; drl[dend].length; dend++)
+                ;
+        if (srl[send].lcn == LCN_ENOENT)
+                marker_vcn = srl[marker = send].vcn;
+        /* Scan to the last element with lcn >= LCN_HOLE. */
+        for (sfinal = send; sfinal >= 0 && srl[sfinal].lcn < LCN_HOLE; sfinal--)
+                ;
+        for (dfinal = dend; dfinal >= 0 && drl[dfinal].lcn < LCN_HOLE; dfinal--)
+                ;
+        {
+        BOOL start;
+        BOOL finish;
+        int ds = dend + 1;              /* Number of elements in drl & srl */
+        int ss = sfinal - sstart + 1;
+        start  = ((drl[dins].lcn <  LCN_RL_NOT_MAPPED) ||    /* End of file   */
+                  (drl[dins].vcn == srl[sstart].vcn));       /* Start of hole */
+        finish = ((drl[dins].lcn >= LCN_RL_NOT_MAPPED) &&    /* End of file   */
+                 ((drl[dins].vcn + drl[dins].length) <=      /* End of hole   */
+                  (srl[send - 1].vcn + srl[send - 1].length)));
+        /* Or we'll lose an end marker */
+        if (start && finish && (drl[dins].length == 0))
+                ss++;
+        if (marker && (drl[dins].vcn + drl[dins].length > srl[send - 1].vcn))
+                finish = FALSE;
+#if 0
+        ntfs_debug("dfinal = %i, dend = %i", dfinal, dend);
+        ntfs_debug("sstart = %i, sfinal = %i, send = %i", sstart, sfinal, send);
+        ntfs_debug("start = %i, finish = %i", start, finish);
+        ntfs_debug("ds = %i, ss = %i, dins = %i", ds, ss, dins);
+#endif
+        if (start) {
+                if (finish)
+                        drl = ntfs_rl_replace(drl, ds, srl + sstart, ss, dins);
+                else
+                        drl = ntfs_rl_insert(drl, ds, srl + sstart, ss, dins);
+        } else {
+                if (finish)
+                        drl = ntfs_rl_append(drl, ds, srl + sstart, ss, dins);
+                else
+                        drl = ntfs_rl_split(drl, ds, srl + sstart, ss, dins);
+        }
+        if (IS_ERR(drl)) {
+                ntfs_error(NULL, "Merge failed.");
+                return drl;
+        }
+        ntfs_free(srl);
+        if (marker) {
+                ntfs_debug("Triggering marker code.");
+                for (ds = dend; drl[ds].length; ds++)
+                        ;
+                /* We only need to care if @srl ended after @drl. */
+                if (drl[ds].vcn <= marker_vcn) {
+                        int slots = 0;
+                        if (drl[ds].vcn == marker_vcn) {
+                                ntfs_debug("Old marker = 0x%llx, replacing "
+                                                "with LCN_ENOENT.",
+                                                (unsigned long long)
+                                                drl[ds].lcn);
+                                drl[ds].lcn = LCN_ENOENT;
+                                goto finished;
+                        }
+                        /*
+                         * We need to create an unmapped runlist element in
+                         * @drl or extend an existing one before adding the
+                         * ENOENT terminator.
+                         */
+                        if (drl[ds].lcn == LCN_ENOENT) {
+                                ds--;
+                                slots = 1;
+                        }
+                        if (drl[ds].lcn != LCN_RL_NOT_MAPPED) {
+                                /* Add an unmapped runlist element. */
+                                if (!slots) {
+                                        /* FIXME/TODO: We need to have the
+                                         * extra memory already! (AIA) */
+                                        drl = ntfs_rl_realloc(drl, ds, ds + 2);
+                                        if (IS_ERR(drl))
+                                                goto critical_error;
+                                        slots = 2;
+                                }
+                                ds++;
+                                /* Need to set vcn if it isn't set already. */
+                                if (slots != 1)
+                                        drl[ds].vcn = drl[ds - 1].vcn +
+                                                        drl[ds - 1].length;
+                                drl[ds].lcn = LCN_RL_NOT_MAPPED;
+                                /* We now used up a slot. */
+                                slots--;
+                        }
+                        drl[ds].length = marker_vcn - drl[ds].vcn;
+                        /* Finally add the ENOENT terminator. */
+                        ds++;
+                        if (!slots) {
+                                /* FIXME/TODO: We need to have the extra
+                                 * memory already! (AIA) */
+                                drl = ntfs_rl_realloc(drl, ds, ds + 1);
+                                if (IS_ERR(drl))
+                                        goto critical_error;
+                        }
+                        drl[ds].vcn = marker_vcn;
+                        drl[ds].lcn = LCN_ENOENT;
+                        drl[ds].length = (s64)0;
+                }
+        }
+        }
+finished:
+        /* The merge was completed successfully. */
+        ntfs_debug("Merged runlist:");
+        ntfs_debug_dump_runlist(drl);
+        return drl;
+critical_error:
+        /* Critical error! We cannot afford to fail here. */
+        ntfs_error(NULL, "Critical error! Not enough memory.");
+        panic("NTFS: Cannot continue.");
+}
+/**
+ * ntfs_mapping_pairs_decompress - convert mapping pairs array to runlist
+ * @vol:        ntfs volume on which the attribute resides
+ * @attr:       attribute record whose mapping pairs array to decompress
+ * @old_rl:     optional runlist in which to insert @attr's runlist
+ *
+ * It is up to the caller to serialize access to the runlist @old_rl.
+ *
+ * Decompress the attribute @attr's mapping pairs array into a runlist. On
+ * success, return the decompressed runlist.
+ *
+ * If @old_rl is not NULL, decompressed runlist is inserted into the
+ * appropriate place in @old_rl and the resultant, combined runlist is
+ * returned. The original @old_rl is deallocated.
+ *
+ * On error, return -errno. @old_rl is left unmodified in that case.
+ *
+ * The following error codes are defined:
+ *      -ENOMEM - Not enough memory to allocate runlist array.
+ *      -EIO    - Corrupt runlist.
+ *      -EINVAL - Invalid parameters were passed in.
+ *      -ERANGE - The two runlists overlap.
+ *
+ * FIXME: For now we take the conceptionally simplest approach of creating the
+ * new runlist disregarding the already existing one and then splicing the
+ * two into one, if that is possible (we check for overlap and discard the new
+ * runlist if overlap present before returning ERR_PTR(-ERANGE)).
+ */
+runlist_element *ntfs_mapping_pairs_decompress(const ntfs_volume *vol,
+                const ATTR_RECORD *attr, runlist_element *old_rl)
+{
+        VCN vcn;                /* Current vcn. */
+        LCN lcn;                /* Current lcn. */
+        s64 deltaxcn;           /* Change in [vl]cn. */
+        runlist_element *rl;    /* The output runlist. */
+        u8 *buf;                /* Current position in mapping pairs array. */
+        u8 *attr_end;           /* End of attribute. */
+        int rlsize;             /* Size of runlist buffer. */
+        u16 rlpos;              /* Current runlist position in units of
+                                   runlist_elements. */
+        u8 b;                   /* Current byte offset in buf. */
+#ifdef DEBUG
+        /* Make sure attr exists and is non-resident. */
+        if (!attr || !attr->non_resident || sle64_to_cpu(
+                        attr->data.non_resident.lowest_vcn) < (VCN)0) {
+                ntfs_error(vol->sb, "Invalid arguments.");
+                return ERR_PTR(-EINVAL);
+        }
+#endif
+        /* Start at vcn = lowest_vcn and lcn 0. */
+        vcn = sle64_to_cpu(attr->data.non_resident.lowest_vcn);
+        lcn = 0;
+        /* Get start of the mapping pairs array. */
+        buf = (u8*)attr + le16_to_cpu(
+                        attr->data.non_resident.mapping_pairs_offset);
+        attr_end = (u8*)attr + le32_to_cpu(attr->length);
+        if (unlikely(buf < (u8*)attr || buf > attr_end)) {
+                ntfs_error(vol->sb, "Corrupt attribute.");
+                return ERR_PTR(-EIO);
+        }
+        /* Current position in runlist array. */
+        rlpos = 0;
+        /* Allocate first page and set current runlist size to one page. */
+        rl = ntfs_malloc_nofs(rlsize = PAGE_SIZE);
+        if (unlikely(!rl))
+                return ERR_PTR(-ENOMEM);
+        /* Insert unmapped starting element if necessary. */
+        if (vcn) {
+                rl->vcn = 0;
+                rl->lcn = LCN_RL_NOT_MAPPED;
+                rl->length = vcn;
+                rlpos++;
+        }
+        while (buf < attr_end && *buf) {
+                /*
+                 * Allocate more memory if needed, including space for the
+                 * not-mapped and terminator elements. ntfs_malloc_nofs()
+                 * operates on whole pages only.
+                 */
+                if (((rlpos + 3) * sizeof(*old_rl)) > rlsize) {
+                        runlist_element *rl2;
+                        rl2 = ntfs_malloc_nofs(rlsize + (int)PAGE_SIZE);
+                        if (unlikely(!rl2)) {
+                                ntfs_free(rl);
+                                return ERR_PTR(-ENOMEM);
+                        }
+                        memcpy(rl2, rl, rlsize);
+                        ntfs_free(rl);
+                        rl = rl2;
+                        rlsize += PAGE_SIZE;
+                }
+                /* Enter the current vcn into the current runlist element. */
+                rl[rlpos].vcn = vcn;
+                /*
+                 * Get the change in vcn, i.e. the run length in clusters.
+                 * Doing it this way ensures that we signextend negative values.
+                 * A negative run length doesn't make any sense, but hey, I
+                 * didn't make up the NTFS specs and Windows NT4 treats the run
+                 * length as a signed value so that's how it is...
+                 */
+                b = *buf & 0xf;
+                if (b) {
+                        if (unlikely(buf + b > attr_end))
+                                goto io_error;
+                        for (deltaxcn = (s8)buf[b--]; b; b--)
+                                deltaxcn = (deltaxcn << 8) + buf[b];
+                } else { /* The length entry is compulsory. */
+                        ntfs_error(vol->sb, "Missing length entry in mapping "
+                                        "pairs array.");
+                        deltaxcn = (s64)-1;
+                }
+                /*
+                 * Assume a negative length to indicate data corruption and
+                 * hence clean-up and return NULL.
+                 */
+                if (unlikely(deltaxcn < 0)) {
+                        ntfs_error(vol->sb, "Invalid length in mapping pairs "
+                                        "array.");
+                        goto err_out;
+                }
+                /*
+                 * Enter the current run length into the current runlist
+                 * element.
+                 */
+                rl[rlpos].length = deltaxcn;
+                /* Increment the current vcn by the current run length. */
+                vcn += deltaxcn;
+                /*
+                 * There might be no lcn change at all, as is the case for
+                 * sparse clusters on NTFS 3.0+, in which case we set the lcn
+                 * to LCN_HOLE.
+                 */
+                if (!(*buf & 0xf0))
+                        rl[rlpos].lcn = LCN_HOLE;
+                else {
+                        /* Get the lcn change which really can be negative. */
+                        u8 b2 = *buf & 0xf;
+                        b = b2 + ((*buf >> 4) & 0xf);
+                        if (buf + b > attr_end)
+                                goto io_error;
+                        for (deltaxcn = (s8)buf[b--]; b > b2; b--)
+                                deltaxcn = (deltaxcn << 8) + buf[b];
+                        /* Change the current lcn to its new value. */
+                        lcn += deltaxcn;
+#ifdef DEBUG
+                        /*
+                         * On NTFS 1.2-, apparently can have lcn == -1 to
+                         * indicate a hole. But we haven't verified ourselves
+                         * whether it is really the lcn or the deltaxcn that is
+                         * -1. So if either is found give us a message so we
+                         * can investigate it further!
+                         */
+                        if (vol->major_ver < 3) {
+                                if (unlikely(deltaxcn == (LCN)-1))
+                                        ntfs_error(vol->sb, "lcn delta == -1");
+                                if (unlikely(lcn == (LCN)-1))
+                                        ntfs_error(vol->sb, "lcn == -1");
+                        }
+#endif
+                        /* Check lcn is not below -1. */
+                        if (unlikely(lcn < (LCN)-1)) {
+                                ntfs_error(vol->sb, "Invalid LCN < -1 in "
+                                                "mapping pairs array.");
+                                goto err_out;
+                        }
+                        /* Enter the current lcn into the runlist element. */
+                        rl[rlpos].lcn = lcn;
+                }
+                /* Get to the next runlist element. */
+                rlpos++;
+                /* Increment the buffer position to the next mapping pair. */
+                buf += (*buf & 0xf) + ((*buf >> 4) & 0xf) + 1;
+        }
+        if (unlikely(buf >= attr_end))
+                goto io_error;
+        /*
+         * If there is a highest_vcn specified, it must be equal to the final
+         * vcn in the runlist - 1, or something has gone badly wrong.
+         */
+        deltaxcn = sle64_to_cpu(attr->data.non_resident.highest_vcn);
+        if (unlikely(deltaxcn && vcn - 1 != deltaxcn)) {
+mpa_err:
+                ntfs_error(vol->sb, "Corrupt mapping pairs array in "
+                                "non-resident attribute.");
+                goto err_out;
+        }
+        /* Setup not mapped runlist element if this is the base extent. */
+        if (!attr->data.non_resident.lowest_vcn) {
+                VCN max_cluster;
+                max_cluster = (sle64_to_cpu(
+                                attr->data.non_resident.allocated_size) +
+                                vol->cluster_size - 1) >>
+                                vol->cluster_size_bits;
+                /*
+                 * If there is a difference between the highest_vcn and the
+                 * highest cluster, the runlist is either corrupt or, more
+                 * likely, there are more extents following this one.
+                 */
+                if (deltaxcn < --max_cluster) {
+                        ntfs_debug("More extents to follow; deltaxcn = 0x%llx, "
+                                        "max_cluster = 0x%llx",
+                                        (unsigned long long)deltaxcn,
+                                        (unsigned long long)max_cluster);
+                        rl[rlpos].vcn = vcn;
+                        vcn += rl[rlpos].length = max_cluster - deltaxcn;
+                        rl[rlpos].lcn = LCN_RL_NOT_MAPPED;
+                        rlpos++;
+                } else if (unlikely(deltaxcn > max_cluster)) {
+                        ntfs_error(vol->sb, "Corrupt attribute. deltaxcn = "
+                                        "0x%llx, max_cluster = 0x%llx",
+                                        (unsigned long long)deltaxcn,
+                                        (unsigned long long)max_cluster);
+                        goto mpa_err;
+                }
+                rl[rlpos].lcn = LCN_ENOENT;
+        } else /* Not the base extent. There may be more extents to follow. */
+                rl[rlpos].lcn = LCN_RL_NOT_MAPPED;
+        /* Setup terminating runlist element. */
+        rl[rlpos].vcn = vcn;
+        rl[rlpos].length = (s64)0;
+        /* If no existing runlist was specified, we are done. */
+        if (!old_rl) {
+                ntfs_debug("Mapping pairs array successfully decompressed:");
+                ntfs_debug_dump_runlist(rl);
+                return rl;
+        }
+        /* Now combine the new and old runlists checking for overlaps. */
+        old_rl = ntfs_runlists_merge(old_rl, rl);
+        if (likely(!IS_ERR(old_rl)))
+                return old_rl;
+        ntfs_free(rl);
+        ntfs_error(vol->sb, "Failed to merge runlists.");
+        return old_rl;
+io_error:
+        ntfs_error(vol->sb, "Corrupt attribute.");
+err_out:
+        ntfs_free(rl);
+        return ERR_PTR(-EIO);
+}
+/**
+ * ntfs_rl_vcn_to_lcn - convert a vcn into a lcn given a runlist
+ * @rl:         runlist to use for conversion
+ * @vcn:        vcn to convert
+ *
+ * Convert the virtual cluster number @vcn of an attribute into a logical
+ * cluster number (lcn) of a device using the runlist @rl to map vcns to their
+ * corresponding lcns.
+ *
+ * It is up to the caller to serialize access to the runlist @rl.
+ *
+ * Since lcns must be >= 0, we use negative return values with special meaning:
+ *
+ * Return value                 Meaning / Description
+ * ==================================================
+ *  -1 = LCN_HOLE               Hole / not allocated on disk.
+ *  -2 = LCN_RL_NOT_MAPPED      This is part of the runlist which has not been
+ *                              inserted into the runlist yet.
+ *  -3 = LCN_ENOENT             There is no such vcn in the attribute.
+ *
+ * Locking: - The caller must have locked the runlist (for reading or writing).
+ *          - This function does not touch the lock.
+ */
+LCN ntfs_rl_vcn_to_lcn(const runlist_element *rl, const VCN vcn)
+{
+        const runlist_element *run;
+        BUG_ON(vcn < 0);
+        /*
+         * A NULL runlist is reported as not mapped.  That gives the caller
+         * the chance to map it and to fail appropriately if that does not
+         * work out.
+         */
+        if (unlikely(!rl))
+                return LCN_RL_NOT_MAPPED;
+        /* A vcn before the start of the first run cannot be resolved. */
+        if (unlikely(vcn < rl->vcn))
+                return LCN_ENOENT;
+        /* Walk the runs up to the terminator, i.e. the zero length one. */
+        for (run = rl; likely(run->length); run++) {
+                /* Not in this run if @vcn is at or past the next one. */
+                if (likely(vcn >= run[1].vcn))
+                        continue;
+                /* Special, i.e. negative, lcns are returned unchanged. */
+                if (unlikely(run->lcn < (LCN)0))
+                        return run->lcn;
+                return run->lcn + (vcn - run->vcn);
+        }
+        /*
+         * @run is the terminator now.  Its lcn should be one of LCN_HOLE,
+         * LCN_RL_NOT_MAPPED, or LCN_ENOENT; fall back to LCN_ENOENT if not.
+         */
+        return likely(run->lcn < (LCN)0) ? run->lcn : LCN_ENOENT;
+}
+/**
+ * ntfs_get_nr_significant_bytes - get number of bytes needed to store a number
+ * @n:          number for which to get the number of bytes for
+ *
+ * Work out the minimum number of bytes in which @n can be stored without
+ * ambiguity as a signed number.
+ *
+ * The mapping pairs array code uses this to find out how much space a given
+ * logical cluster number (lcn) change or a given run length is going to take
+ * up in the array.
+ *
+ * Return the number of bytes required.  This function cannot fail.
+ */
+static inline int ntfs_get_nr_significant_bytes(const s64 n)
+{
+        s64 rest = n >> 8;
+        int nr_bytes = 1;
+        s8 top;
+        /* Add bytes until only the sign extension of @n remains. */
+        while (rest != 0 && rest != -1) {
+                rest >>= 8;
+                nr_bytes++;
+        }
+        top = (n >> 8 * (nr_bytes - 1)) & 0xff;
+        /* One more byte if the sign of @top does not match that of @n. */
+        if ((n < 0) != (top < 0))
+                nr_bytes++;
+        return nr_bytes;
+}
+/**
+ * ntfs_get_size_for_mapping_pairs - get bytes needed for mapping pairs array
+ * @vol:        ntfs volume (needed for the ntfs version)
+ * @rl:         locked runlist to determine the size of the mapping pairs of
+ * @start_vcn:  vcn at which to start the mapping pairs array
+ *
+ * Walk the locked runlist @rl and calculate the size in bytes of the mapping
+ * pairs array corresponding to the runlist @rl, starting at vcn @start_vcn.
+ * This for example allows us to allocate a buffer of the right size when
+ * building the mapping pairs array.
+ *
+ * If @rl is NULL, just return 1 (for the single terminator byte).
+ *
+ * Return the calculated size in bytes on success.  On error, return -errno.
+ * The following error codes are defined:
+ *      -EINVAL - Run list contains unmapped elements.  Make sure to only pass
+ *                fully mapped runlists to this function.
+ *      -EIO    - The runlist is corrupt.
+ *
+ * Locking: @rl must be locked on entry (either for reading or writing), it
+ *          remains locked throughout, and is left locked upon return.
+ */
+int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol,
+                const runlist_element *rl, const VCN start_vcn)
+{
+        LCN last_lcn = 0;
+        /* The terminating zero byte is always needed. */
+        int size = 1;
+        BUG_ON(start_vcn < 0);
+        /* Without a runlist there is only the terminator byte to store. */
+        if (!rl) {
+                BUG_ON(start_vcn);
+                return 1;
+        }
+        /* Advance to the runlist element which contains @start_vcn. */
+        for (; rl->length; rl++) {
+                if (start_vcn < rl[1].vcn)
+                        break;
+        }
+        /*
+         * @start_vcn must lie within the runlist or be exactly at its end.
+         */
+        if (start_vcn < rl->vcn)
+                return -EINVAL;
+        if (!rl->length && start_vcn > rl->vcn)
+                return -EINVAL;
+        /* A run that @start_vcn cuts into is only counted from there on. */
+        if (start_vcn > rl->vcn) {
+                s64 skip;
+                /* The run length is known to be non-zero at this point. */
+                if (rl->lcn < LCN_HOLE || rl->length < 0)
+                        goto err_out;
+                skip = start_vcn - rl->vcn;
+                /* One header byte plus the bytes of the remaining length. */
+                size += 1 + ntfs_get_nr_significant_bytes(rl->length - skip);
+                /*
+                 * On NTFS 3.0+ a hole has no lcn stored for it and hence takes
+                 * up no space.  Earlier NTFS versions just store the lcn.
+                 * Note: this assumes that on NTFS 1.2-, holes are stored with
+                 * an lcn of -1 and not a delta_lcn of -1 (unless both are -1).
+                 */
+                if (rl->lcn >= 0 || vol->major_ver < 3) {
+                        last_lcn = rl->lcn;
+                        /* Only real lcns move on by the skipped clusters. */
+                        if (last_lcn >= 0)
+                                last_lcn += skip;
+                        /* The first lcn change is relative to lcn 0. */
+                        size += ntfs_get_nr_significant_bytes(last_lcn);
+                }
+                /* Done with this run, move on to the next one. */
+                rl++;
+        }
+        /* Now count all remaining runs in full. */
+        for (; rl->length; rl++) {
+                if (rl->lcn < LCN_HOLE || rl->length < 0)
+                        goto err_out;
+                /* One header byte plus the bytes of the run length. */
+                size += 1 + ntfs_get_nr_significant_bytes(rl->length);
+                /*
+                 * As above: holes take up no space on NTFS 3.0+ whilst
+                 * earlier NTFS versions store the lcn.  The same assumption
+                 * about how NTFS 1.2- stores holes applies here, too.
+                 */
+                if (rl->lcn < 0 && vol->major_ver >= 3)
+                        continue;
+                /* The bytes of the lcn change relative to the previous run. */
+                size += ntfs_get_nr_significant_bytes(rl->lcn - last_lcn);
+                last_lcn = rl->lcn;
+        }
+        return size;
+err_out:
+        /* -EINVAL for an unmapped run, -EIO for anything else. */
+        return rl->lcn == LCN_RL_NOT_MAPPED ? -EINVAL : -EIO;
+}
+/**
+ * ntfs_write_significant_bytes - write the significant bytes of a number
+ * @dst:        destination buffer to write to
+ * @dst_max:    pointer to last byte of destination buffer for bounds checking
+ * @n:          number whose significant bytes to write
+ *
+ * Write to @dst the minimum number of bytes of @n which are needed to
+ * identify @n unambiguously as a signed number.  The bytes are written least
+ * significant byte first and no byte is written beyond @dst_max, the last
+ * position within the destination buffer to which we are allowed to write.
+ *
+ * This is used when building the mapping pairs array of a runlist to compress
+ * a given logical cluster number (lcn) change or a specific run length to the
+ * minimum size possible.
+ *
+ * Note that when the destination buffer turns out to be too small, some
+ * bytes of @n may have been written to @dst already.
+ *
+ * Return the number of bytes written on success.  On error, i.e. the
+ * destination buffer @dst is too small, return -ENOSPC.
+ */
+static inline int ntfs_write_significant_bytes(s8 *dst, const s8 *dst_max,
+                const s64 n)
+{
+        s64 rest = n;
+        int written = 0;
+        s8 top;
+        /* Emit @n lowest byte first until only sign extension is left. */
+        for (;;) {
+                /* Never write beyond the end of the buffer. */
+                if (dst > dst_max)
+                        return -ENOSPC;
+                *dst++ = rest & 0xffll;
+                rest >>= 8;
+                written++;
+                if (rest == 0 || rest == -1)
+                        break;
+        }
+        /* The most significant of the bytes written so far. */
+        top = (n >> 8 * (written - 1)) & 0xff;
+        /* All done if the sign of @top matches the sign of @n. */
+        if ((n < 0) == (top < 0))
+                return written;
+        /* Otherwise add a byte of pure sign extension, i.e. -1 or 0. */
+        if (dst > dst_max)
+                return -ENOSPC;
+        *dst = n < 0 ? (s8)-1 : (s8)0;
+        return written + 1;
+}
+/**
+ * ntfs_mapping_pairs_build - build the mapping pairs array from a runlist
+ * @vol:        ntfs volume (needed for the ntfs version)
+ * @dst:        destination buffer to which to write the mapping pairs array
+ * @dst_len:    size of destination buffer @dst in bytes
+ * @rl:         locked runlist for which to build the mapping pairs array
+ * @start_vcn:  vcn at which to start the mapping pairs array
+ * @stop_vcn:   first vcn outside destination buffer on success or -ENOSPC
+ *
+ * Create the mapping pairs array from the locked runlist @rl, starting at vcn
+ * @start_vcn and save the array in @dst.  @dst_len is the size of @dst in
+ * bytes and it should be at least equal to the value obtained by calling
+ * ntfs_get_size_for_mapping_pairs().
+ *
+ * If @rl is NULL, just write a single terminator byte to @dst.
+ *
+ * On success or -ENOSPC error, if @stop_vcn is not NULL, *@stop_vcn is set to
+ * the first vcn outside the destination buffer.  Note that on error, @dst has
+ * been filled with all the mapping pairs that will fit, thus it can be treated
+ * as partial success, in that a new attribute extent needs to be created or
+ * the next extent has to be used and the mapping pairs build has to be
+ * continued with @start_vcn set to *@stop_vcn.
+ *
+ * Return 0 on success and -errno on error.  The following error codes are
+ * defined:
+ *      -EINVAL - Run list contains unmapped elements.  Make sure to only pass
+ *                fully mapped runlists to this function.
+ *      -EIO    - The runlist is corrupt.
+ *      -ENOSPC - The destination buffer is too small.
+ *
+ * Locking: @rl must be locked on entry (either for reading or writing), it
+ *          remains locked throughout, and is left locked upon return.
+ */
+int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst,
+                const int dst_len, const runlist_element *rl,
+                const VCN start_vcn, VCN *const stop_vcn)
+{
+        LCN prev_lcn;
+        s8 *dst_max, *dst_next;
+        int err = -ENOSPC;
+        s8 len_len, lcn_len;
+        BUG_ON(start_vcn < 0);
+        BUG_ON(dst_len < 1);
+        if (!rl) {
+                BUG_ON(start_vcn);
+                if (stop_vcn)
+                        *stop_vcn = 0;
+                /* No runlist: the array is just the terminator byte. */
+                *dst = 0;
+                return 0;
+        }
+        /* Skip to runlist element containing @start_vcn. */
+        while (rl->length && start_vcn >= rl[1].vcn)
+                rl++;
+        if ((!rl->length && start_vcn > rl->vcn) || start_vcn < rl->vcn)
+                return -EINVAL;
+        /*
+         * @dst_max points at the last byte of @dst.  It is used for bounds
+         * checking here and in ntfs_write_significant_bytes().
+         */
+        dst_max = dst + dst_len - 1;
+        prev_lcn = 0;
+        /* Do the first partial run if present. */
+        if (start_vcn > rl->vcn) {
+                s64 delta;
+                /* We know rl->length != 0 already. */
+                if (rl->length < 0 || rl->lcn < LCN_HOLE)
+                        goto err_out;
+                delta = start_vcn - rl->vcn;
+                /* Write length; @dst[0] is kept free for the header byte. */
+                len_len = ntfs_write_significant_bytes(dst + 1, dst_max,
+                                rl->length - delta);
+                if (len_len < 0)
+                        goto size_err;
+                /*
+                 * If the logical cluster number (lcn) denotes a hole and we
+                 * are on NTFS 3.0+, we don't store it at all, i.e. we need
+                 * zero space.  On earlier NTFS versions we just write the lcn
+                 * change.  FIXME: Do we need to write the lcn change or just
+                 * the lcn in that case?  Not sure as I have never seen this
+                 * case on NT4. - We assume that we just need to write the lcn
+                 * change until someone tells us otherwise... (AIA)
+                 */
+                if (rl->lcn >= 0 || vol->major_ver < 3) {
+                        prev_lcn = rl->lcn;
+                        if (rl->lcn >= 0)
+                                prev_lcn += delta;
+                        /* Write change in lcn. */
+                        lcn_len = ntfs_write_significant_bytes(dst + 1 +
+                                        len_len, dst_max, prev_lcn);
+                        if (lcn_len < 0)
+                                goto size_err;
+                } else
+                        lcn_len = 0;
+                dst_next = dst + len_len + lcn_len + 1;
+                /* Room must remain for at least the terminator byte. */
+                if (dst_next > dst_max)
+                        goto size_err;
+                /* Header byte: lcn size in high nibble, length size in low. */
+                *dst = lcn_len << 4 | len_len;
+                /* Position at next mapping pairs array element. */
+                dst = dst_next;
+                /* Go to next runlist element. */
+                rl++;
+        }
+        /* Do the full runs. */
+        for (; rl->length; rl++) {
+                if (rl->length < 0 || rl->lcn < LCN_HOLE)
+                        goto err_out;
+                /* Write length; @dst[0] is kept free for the header byte. */
+                len_len = ntfs_write_significant_bytes(dst + 1, dst_max,
+                                rl->length);
+                if (len_len < 0)
+                        goto size_err;
+                /*
+                 * Holes are not stored on NTFS 3.0+.  See the first partial
+                 * run above regarding the lcn change on earlier versions.
+                 */
+                if (rl->lcn >= 0 || vol->major_ver < 3) {
+                        /* Write change in lcn. */
+                        lcn_len = ntfs_write_significant_bytes(dst + 1 +
+                                        len_len, dst_max, rl->lcn - prev_lcn);
+                        if (lcn_len < 0)
+                                goto size_err;
+                        prev_lcn = rl->lcn;
+                } else
+                        lcn_len = 0;
+                dst_next = dst + len_len + lcn_len + 1;
+                /* Room must remain for at least the terminator byte. */
+                if (dst_next > dst_max)
+                        goto size_err;
+                /* Header byte: lcn size in high nibble, length size in low. */
+                *dst = lcn_len << 4 | len_len;
+                /* Position at next mapping pairs array element. */
+                dst = dst_next;
+        }
+        /* Success. */
+        err = 0;
+size_err:
+        /* Stop vcn: terminator on success, else first run not written. */
+        if (stop_vcn)
+                *stop_vcn = rl->vcn;
+        /* Add terminator byte in place of the next header byte. */
+        *dst = 0;
+        return err;
+err_out:
+        /*
+         * Bad runlist: *@stop_vcn is not set and no terminator is written.
+         */
+        if (rl->lcn == LCN_RL_NOT_MAPPED)
+                err = -EINVAL;
+        else
+                err = -EIO;
+        return err;
+}
+/**
+ * ntfs_rl_truncate_nolock - truncate a runlist starting at a specified vcn
+ * @vol:        ntfs volume (only used for reporting errors and warnings)
+ * @runlist:    runlist to truncate
+ * @new_length: the new length of the runlist in VCNs
+ *
+ * Truncate the runlist described by @runlist as well as the memory buffer
+ * holding the runlist elements to a length of @new_length VCNs.
+ *
+ * If @new_length lies within the runlist, the runlist elements with VCNs of
+ * @new_length and above are discarded.
+ *
+ * If @new_length lies beyond the runlist, the last runlist element is
+ * extended if it is a sparse one, otherwise a sparse element is appended.
+ *
+ * Return 0 on success and -errno on error.
+ *
+ * Locking: The caller must hold @runlist->lock for writing.
+ */
+int ntfs_rl_truncate_nolock(const ntfs_volume *vol, runlist *const runlist,
+                const s64 new_length)
+{
+        runlist_element *rl;
+        int old_size;
+        ntfs_debug("Entering for new_length 0x%llx.", (long long)new_length);
+        BUG_ON(!runlist);
+        BUG_ON(new_length < 0);
+        rl = runlist->rl;
+        if (unlikely(!rl)) {
+                /*
+                 * Create a runlist consisting of a sparse runlist element of
+                 * length @new_length followed by a terminator runlist element.
+                 */
+                rl = ntfs_malloc_nofs(PAGE_SIZE);
+                if (unlikely(!rl)) {
+                        ntfs_error(vol->sb, "Not enough memory to allocate "
+                                        "runlist element buffer.");
+                        return -ENOMEM;
+                }
+                runlist->rl = rl;
+                rl[1].length = rl->vcn = 0;
+                rl->lcn = LCN_HOLE;
+                rl[1].vcn = rl->length = new_length;
+                rl[1].lcn = LCN_ENOENT;
+                return 0;
+        }
+        BUG_ON(new_length < rl->vcn);
+        /* Find @new_length in the runlist. */
+        while (likely(rl->length && new_length >= rl[1].vcn))
+                rl++;
+        /*
+         * If not at the end of the runlist we need to shrink it.
+         * If at the end of the runlist we need to expand it.
+         */
+        if (rl->length) {
+                runlist_element *trl;
+                BOOL is_end;
+                ntfs_debug("Shrinking runlist.");
+                /* Count the runlist elements including the terminator. */
+                trl = rl + 1;
+                while (likely(trl->length))
+                        trl++;
+                old_size = trl - runlist->rl + 1;
+                /* Truncate the run. */
+                rl->length = new_length - rl->vcn;
+                /*
+                 * If a run was partially truncated, make the following runlist
+                 * element a terminator.
+                 */
+                is_end = FALSE;
+                if (rl->length) {
+                        rl++;
+                        if (!rl->length)
+                                is_end = TRUE;
+                        rl->vcn = new_length;
+                        rl->length = 0;
+                }
+                rl->lcn = LCN_ENOENT;
+                /* Shrink the buffer unless the terminator did not move. */
+                if (!is_end) {
+                        int new_size = rl - runlist->rl + 1;
+                        rl = ntfs_rl_realloc(runlist->rl, old_size, new_size);
+                        if (IS_ERR(rl))
+                                ntfs_warning(vol->sb, "Failed to shrink "
+                                                "runlist buffer.  This just "
+                                                "wastes a bit of memory "
+                                                "temporarily so we ignore it "
+                                                "and return success.");
+                        else
+                                runlist->rl = rl;
+                }
+        } else if (likely(/* !rl->length && */ new_length > rl->vcn)) {
+                ntfs_debug("Expanding runlist.");
+                /*
+                 * If there is a previous runlist element and it is a sparse
+                 * one, extend it.  Otherwise need to add a new, sparse runlist
+                 * element.
+                 */
+                if ((rl > runlist->rl) && ((rl - 1)->lcn == LCN_HOLE))
+                        (rl - 1)->length = new_length - (rl - 1)->vcn;
+                else {
+                        /* Determine the runlist size. */
+                        old_size = rl - runlist->rl + 1;
+                        /* Make room for one more runlist element. */
+                        rl = ntfs_rl_realloc(runlist->rl, old_size,
+                                        old_size + 1);
+                        if (IS_ERR(rl)) {
+                                ntfs_error(vol->sb, "Failed to expand runlist "
+                                                "buffer, aborting.");
+                                return PTR_ERR(rl);
+                        }
+                        runlist->rl = rl;
+                        /*
+                         * Set @rl to the same runlist element in the new
+                         * runlist as before in the old runlist.
+                         */
+                        rl += old_size - 1;
+                        /* Add a new, sparse runlist element. */
+                        rl->lcn = LCN_HOLE;
+                        rl->length = new_length - rl->vcn;
+                        /* Add a new terminator runlist element. */
+                        rl++;
+                        rl->length = 0;
+                }
+                rl->vcn = new_length;
+                rl->lcn = LCN_ENOENT;
+        } else /* if (unlikely(!rl->length && new_length == rl->vcn)) */ {
+                /* Runlist already has same size as requested. */
+                rl->lcn = LCN_ENOENT;
+        }
+        ntfs_debug("Done.");
+        return 0;
+}
diff --git a/fs/ntfs/runlist.h b/fs/ntfs/runlist.h
new file mode 100644
index 000000000000..7107fde59df9
--- /dev/null
+++ b/fs/ntfs/runlist.h
@@ -0,0 +1,89 @@
+/*
+ * runlist.h - Defines for runlist handling in NTFS Linux kernel driver.
+ *             Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2004 Anton Altaparmakov
+ * Copyright (c) 2002 Richard Russon
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#ifndef _LINUX_NTFS_RUNLIST_H
+#define _LINUX_NTFS_RUNLIST_H
+#include "types.h"
+#include "layout.h"
+#include "volume.h"
+/**
+ * runlist_element - in memory vcn to lcn mapping array element
+ * @vcn:        starting vcn of the current array element
+ * @lcn:        starting lcn of the current array element
+ * @length:     length in clusters of the current array element
+ *
+ * The array is terminated by an element with @length == 0, whose @vcn is the
+ * last vcn + 1.  When @lcn == -1 (LCN_HOLE) the @length vcns starting at @vcn
+ * are not physically allocated (i.e. this is a hole / data is sparse).  The
+ * other negative @lcn values are listed in LCN_SPECIAL_VALUES.
+ */
+typedef struct {        /* In memory vcn to lcn mapping structure element. */
+        VCN vcn;        /* vcn = Starting virtual cluster number. */
+        LCN lcn;        /* lcn = Starting logical cluster number. */
+        s64 length;     /* Run length in clusters. */
+} runlist_element;
+/**
+ * runlist - in memory vcn to lcn mapping array including a read/write lock
+ * @rl:         pointer to an array of runlist elements
+ * @lock:       read/write semaphore for serializing access to @rl
+ *
+ */
+typedef struct {
+        runlist_element *rl;            /* NULL after ntfs_init_runlist(). */
+        struct rw_semaphore lock;       /* Set up by ntfs_init_runlist(). */
+} runlist;
+static inline void ntfs_init_runlist(runlist *rl)
+{
+        init_rwsem(&rl->lock);  /* Prepare the lock guarding the array. */
+        rl->rl = NULL;          /* No runlist array is attached yet. */
+}
+typedef enum {          /* Negative, special values of runlist_element.lcn. */
+        LCN_HOLE                = -1,   /* Keep this as highest value or die! */
+        LCN_RL_NOT_MAPPED       = -2,   /* Presumably: run not mapped yet. */
+        LCN_ENOENT              = -3,   /* Used for the terminator element. */
+} LCN_SPECIAL_VALUES;
+extern runlist_element *ntfs_runlists_merge(runlist_element *drl,
+                runlist_element *srl);
+extern runlist_element *ntfs_mapping_pairs_decompress(const ntfs_volume *vol,
+                const ATTR_RECORD *attr, runlist_element *old_rl);
+extern LCN ntfs_rl_vcn_to_lcn(const runlist_element *rl, const VCN vcn);
+extern int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol,
+                const runlist_element *rl, const VCN start_vcn);
+extern int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst,
+                const int dst_len, const runlist_element *rl,
+                const VCN start_vcn, VCN *const stop_vcn);
+extern int ntfs_rl_truncate_nolock(const ntfs_volume *vol,
+                runlist *const runlist, const s64 new_length);
+#endif /* _LINUX_NTFS_RUNLIST_H */
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
new file mode 100644
index 000000000000..212a3d0f2073
--- /dev/null
+++ b/fs/ntfs/super.c
@@ -0,0 +1,2771 @@
+/*
+ * super.c - NTFS kernel super block handling. Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2004 Anton Altaparmakov
+ * Copyright (c) 2001,2002 Richard Russon
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#include <linux/stddef.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/spinlock.h>
+#include <linux/blkdev.h>       /* For bdev_hardsect_size(). */
+#include <linux/backing-dev.h>
+#include <linux/buffer_head.h>
+#include <linux/vfs.h>
+#include <linux/moduleparam.h>
+#include <linux/smp_lock.h>
+#include "sysctl.h"
+#include "logfile.h"
+#include "quota.h"
+#include "dir.h"
+#include "debug.h"
+#include "index.h"
+#include "aops.h"
+#include "malloc.h"
+#include "ntfs.h"
+/* Number of mounted file systems which have compression enabled. */
+static unsigned long ntfs_nr_compression_users;
+/* A global default upcase table and a corresponding reference count. */
+static ntfschar *default_upcase = NULL;
+static unsigned long ntfs_nr_upcase_users = 0;
+/* Bits/strings of the errors= mount option; see also ntfs_show_options(). */
+typedef enum {
+        /* One of these must be present, default is ON_ERRORS_CONTINUE. */
+        ON_ERRORS_PANIC                 = 0x01,
+        ON_ERRORS_REMOUNT_RO            = 0x02,
+        ON_ERRORS_CONTINUE              = 0x04,
+        /* Optional, can be combined with any of the above. */
+        ON_ERRORS_RECOVER               = 0x10,
+} ON_ERRORS_ACTIONS;
+const option_t on_errors_arr[] = {      /* Option string for each bit. */
+        { ON_ERRORS_PANIC,      "panic" },
+        { ON_ERRORS_REMOUNT_RO, "remount-ro", },
+        { ON_ERRORS_CONTINUE,   "continue", },
+        { ON_ERRORS_RECOVER,    "recover" },
+        { 0,                    NULL }          /* End of table marker. */
+};
+/**
+ * simple_getbool - convert a mount option argument into a boolean
+ * @s:          option argument string, NULL is treated like "true"
+ * @setval:     destination for the parsed value
+ *
+ * Return 1 on success and 0 if @s is not a recognized boolean string, in
+ * which case @setval is left unmodified.
+ * Copied from old ntfs driver (which copied from vfat driver).
+ */
+static int simple_getbool(char *s, BOOL *setval)
+{
+        if (!s || !strcmp(s, "1") || !strcmp(s, "yes") || !strcmp(s, "true"))
+                *setval = TRUE;
+        else if (!strcmp(s, "0") || !strcmp(s, "no") || !strcmp(s, "false"))
+                *setval = FALSE;
+        else
+                return 0;
+        return 1;
+}
+/**
+ * parse_options - parse the (re)mount options
+ * @vol:        ntfs volume
+ * @opt:        string containing the (re)mount options (modified in place)
+ *
+ * Parse the options in @opt into @vol.  Return TRUE on success, else FALSE.
+ */
+static BOOL parse_options(ntfs_volume *vol, char *opt)
+{
+        char *p, *v, *ov;
+        static char *utf8 = "utf8";
+        int errors = 0, sloppy = 0;
+        uid_t uid = (uid_t)-1;
+        gid_t gid = (gid_t)-1;
+        mode_t fmask = (mode_t)-1, dmask = (mode_t)-1;
+        int mft_zone_multiplier = -1, on_errors = -1;
+        int show_sys_files = -1, case_sensitive = -1;
+        struct nls_table *nls_map = NULL, *old_nls;
+        /* Helper macros; they use p (name), v (value) and ov (saved v). */
+#define NTFS_GETOPT_WITH_DEFAULT(option, variable, default_value)       \
+        if (!strcmp(p, option)) {                                       \
+                if (!v || !*v)                                          \
+                        variable = default_value;                       \
+                else {                                                  \
+                        variable = simple_strtoul(ov = v, &v, 0);       \
+                        if (*v)                                         \
+                                goto needs_val;                         \
+                }                                                       \
+        }
+#define NTFS_GETOPT(option, variable)                                   \
+        if (!strcmp(p, option)) {                                       \
+                if (!v || !*v)                                          \
+                        goto needs_arg;                                 \
+                variable = simple_strtoul(ov = v, &v, 0);               \
+                if (*v)                                                 \
+                        goto needs_val;                                 \
+        }
+#define NTFS_GETOPT_BOOL(option, variable)                              \
+        if (!strcmp(p, option)) {                                       \
+                BOOL val;                                               \
+                if (!simple_getbool(v, &val))                           \
+                        goto needs_bool;                                \
+                variable = val;                                         \
+        }
+#define NTFS_GETOPT_OPTIONS_ARRAY(option, variable, opt_array)          \
+        if (!strcmp(p, option)) {                                       \
+                int _i;                                                 \
+                if (!v || !*v)                                          \
+                        goto needs_arg;                                 \
+                ov = v;                                                 \
+                if (variable == -1)                                     \
+                        variable = 0;                                   \
+                for (_i = 0; opt_array[_i].str && *opt_array[_i].str; _i++) \
+                        if (!strcmp(opt_array[_i].str, v)) {            \
+                                variable |= opt_array[_i].val;          \
+                                break;                                  \
+                        }                                               \
+                if (!opt_array[_i].str || !*opt_array[_i].str)          \
+                        goto needs_val;                                 \
+        }
+        if (!opt || !*opt)
+                goto no_mount_options;
+        ntfs_debug("Entering with mount options string: %s", opt);
+        while ((p = strsep(&opt, ","))) {
+                if ((v = strchr(p, '=')))
+                        *v++ = 0;       /* Split "name=value" in place. */
+                NTFS_GETOPT("uid", uid)
+                else NTFS_GETOPT("gid", gid)
+                else NTFS_GETOPT("umask", fmask = dmask)
+                else NTFS_GETOPT("fmask", fmask)
+                else NTFS_GETOPT("dmask", dmask)
+                else NTFS_GETOPT("mft_zone_multiplier", mft_zone_multiplier)
+                else NTFS_GETOPT_WITH_DEFAULT("sloppy", sloppy, TRUE)
+                else NTFS_GETOPT_BOOL("show_sys_files", show_sys_files)
+                else NTFS_GETOPT_BOOL("case_sensitive", case_sensitive)
+                else NTFS_GETOPT_OPTIONS_ARRAY("errors", on_errors,
+                                on_errors_arr)
+                else if (!strcmp(p, "posix") || !strcmp(p, "show_inodes"))
+                        ntfs_warning(vol->sb, "Ignoring obsolete option %s.",
+                                        p);
+                else if (!strcmp(p, "nls") || !strcmp(p, "iocharset")) {
+                        if (!strcmp(p, "iocharset"))
+                                ntfs_warning(vol->sb, "Option iocharset is "
+                                                "deprecated. Please use "
+                                                "option nls=<charsetname> in "
+                                                "the future.");
+                        if (!v || !*v)
+                                goto needs_arg;
+use_utf8:               /* Also the target of the goto in the utf8 branch. */
+                        old_nls = nls_map;
+                        nls_map = load_nls(v);
+                        if (!nls_map) {
+                                if (!old_nls) {
+                                        ntfs_error(vol->sb, "NLS character set "
+                                                        "%s not found.", v);
+                                        return FALSE;
+                                }
+                                ntfs_error(vol->sb, "NLS character set %s not "
+                                                "found. Using previous one %s.",
+                                                v, old_nls->charset);
+                                nls_map = old_nls;
+                        } else /* nls_map */ {
+                                if (old_nls)
+                                        unload_nls(old_nls);
+                        }
+                } else if (!strcmp(p, "utf8")) {
+                        BOOL val = FALSE;
+                        ntfs_warning(vol->sb, "Option utf8 is no longer "
+                                   "supported, using option nls=utf8. Please "
+                                   "use option nls=utf8 in the future and "
+                                   "make sure utf8 is compiled either as a "
+                                   "module or into the kernel.");
+                        if (!v || !*v)
+                                val = TRUE;
+                        else if (!simple_getbool(v, &val))
+                                goto needs_bool;
+                        if (val) {
+                                v = utf8;
+                                goto use_utf8;
+                        }
+                } else {
+                        ntfs_error(vol->sb, "Unrecognized mount option %s.", p);
+                        if (errors < INT_MAX)
+                                errors++;
+                }
+#undef NTFS_GETOPT_OPTIONS_ARRAY
+#undef NTFS_GETOPT_BOOL
+#undef NTFS_GETOPT
+#undef NTFS_GETOPT_WITH_DEFAULT
+        }
+no_mount_options:
+        if (errors && !sloppy)
+                return FALSE;
+        if (sloppy)
+                ntfs_warning(vol->sb, "Sloppy option given. Ignoring "
+                                "unrecognized mount option(s) and continuing.");
+        /* Keep this first! */
+        if (on_errors != -1) {
+                if (!on_errors) {
+                        ntfs_error(vol->sb, "Invalid errors option argument "
+                                        "or bug in options parser.");
+                        return FALSE;
+                }
+        }
+        if (nls_map) {
+                if (vol->nls_map && vol->nls_map != nls_map) {
+                        ntfs_error(vol->sb, "Cannot change NLS character set "
+                                        "on remount.");
+                        return FALSE;   /* NOTE(review): nls_map leaked? */
+                } /* else (!vol->nls_map) */
+                ntfs_debug("Using NLS character set %s.", nls_map->charset);
+                vol->nls_map = nls_map;
+        } else /* (!nls_map) */ {
+                if (!vol->nls_map) {
+                        vol->nls_map = load_nls_default();
+                        if (!vol->nls_map) {
+                                ntfs_error(vol->sb, "Failed to load default "
+                                                "NLS character set.");
+                                return FALSE;
+                        }
+                        ntfs_debug("Using default NLS character set (%s).",
+                                        vol->nls_map->charset);
+                }
+        }
+        if (mft_zone_multiplier != -1) {
+                if (vol->mft_zone_multiplier && vol->mft_zone_multiplier !=
+                                mft_zone_multiplier) {
+                        ntfs_error(vol->sb, "Cannot change mft_zone_multiplier "
+                                        "on remount.");
+                        return FALSE;
+                }
+                if (mft_zone_multiplier < 1 || mft_zone_multiplier > 4) {
+                        ntfs_error(vol->sb, "Invalid mft_zone_multiplier. "
+                                        "Using default value, i.e. 1.");
+                        mft_zone_multiplier = 1;
+                }
+                vol->mft_zone_multiplier = mft_zone_multiplier;
+        }
+        if (!vol->mft_zone_multiplier)
+                vol->mft_zone_multiplier = 1;
+        if (on_errors != -1)
+                vol->on_errors = on_errors;
+        if (!vol->on_errors || vol->on_errors == ON_ERRORS_RECOVER)
+                vol->on_errors |= ON_ERRORS_CONTINUE;
+        if (uid != (uid_t)-1)
+                vol->uid = uid;
+        if (gid != (gid_t)-1)
+                vol->gid = gid;
+        if (fmask != (mode_t)-1)
+                vol->fmask = fmask;
+        if (dmask != (mode_t)-1)
+                vol->dmask = dmask;
+        if (show_sys_files != -1) {
+                if (show_sys_files)
+                        NVolSetShowSystemFiles(vol);
+                else
+                        NVolClearShowSystemFiles(vol);
+        }
+        if (case_sensitive != -1) {
+                if (case_sensitive)
+                        NVolSetCaseSensitive(vol);
+                else
+                        NVolClearCaseSensitive(vol);
+        }
+        return TRUE;
+needs_arg:
+        ntfs_error(vol->sb, "The %s option requires an argument.", p);
+        return FALSE;
+needs_bool:
+        ntfs_error(vol->sb, "The %s option requires a boolean argument.", p);
+        return FALSE;
+needs_val:
+        ntfs_error(vol->sb, "Invalid %s option argument: %s", p, ov);
+        return FALSE;
+}
+#ifdef NTFS_RW
+/**
+ * ntfs_write_volume_flags - write new flags to the volume information flags
+ * @vol:        ntfs volume on which to modify the flags
+ * @flags:      new flags value for the volume information flags
+ *
+ * Internal function.  You probably want to use ntfs_{set,clear}_volume_flags()
+ * instead (see below).
+ *
+ * Replace the volume information flags on the volume @vol with the value
+ * supplied in @flags.  Note, this overwrites the volume information flags, so
+ * make sure to combine the flags you want to modify with the old flags and use
+ * the result when calling ntfs_write_volume_flags().
+ *
+ * Return 0 on success and -errno on error.
+ */
+static int ntfs_write_volume_flags(ntfs_volume *vol, const VOLUME_FLAGS flags)
+{
+        ntfs_inode *ni = NTFS_I(vol->vol_ino);
+        MFT_RECORD *m;
+        VOLUME_INFORMATION *vi;
+        ntfs_attr_search_ctx *ctx;
+        int err;
+        ntfs_debug("Entering, old flags = 0x%x, new flags = 0x%x.",
+                        le16_to_cpu(vol->vol_flags), le16_to_cpu(flags));
+        if (vol->vol_flags == flags)
+                goto done;      /* Nothing to change. */
+        BUG_ON(!ni);
+        m = map_mft_record(ni);
+        if (IS_ERR(m)) {
+                err = PTR_ERR(m);
+                goto err_out;
+        }
+        ctx = ntfs_attr_get_search_ctx(ni, m);
+        if (!ctx) {
+                err = -ENOMEM;
+                goto put_unm_err_out;
+        }
+        err = ntfs_attr_lookup(AT_VOLUME_INFORMATION, NULL, 0, 0, 0, NULL, 0,
+                        ctx);
+        if (err)
+                goto put_unm_err_out;
+        vi = (VOLUME_INFORMATION*)((u8*)ctx->attr +
+                        le16_to_cpu(ctx->attr->data.resident.value_offset));
+        vol->vol_flags = vi->flags = flags;     /* Cached and mapped copy. */
+        flush_dcache_mft_record_page(ctx->ntfs_ino);
+        mark_mft_record_dirty(ctx->ntfs_ino);
+        ntfs_attr_put_search_ctx(ctx);
+        unmap_mft_record(ni);
+done:
+        ntfs_debug("Done.");
+        return 0;
+put_unm_err_out:
+        if (ctx)        /* NULL if the search context allocation failed. */
+                ntfs_attr_put_search_ctx(ctx);
+        unmap_mft_record(ni);
+err_out:
+        ntfs_error(vol->sb, "Failed with error code %i.", -err);
+        return err;
+}
+/**
+ * ntfs_set_volume_flags - set bits in the volume information flags
+ * @vol:        ntfs volume on which to modify the flags
+ * @flags:      flags to set on the volume
+ *
+ * Turn on the bits given in @flags (restricted to VOLUME_FLAGS_MASK) in the
+ * volume information flags of @vol, leaving all other bits as they are.
+ * Return 0 on success and -errno on error.
+ */
+static inline int ntfs_set_volume_flags(ntfs_volume *vol, VOLUME_FLAGS flags)
+{
+        return ntfs_write_volume_flags(vol,
+                        vol->vol_flags | (flags & VOLUME_FLAGS_MASK));
+}
+/**
+ * ntfs_clear_volume_flags - clear bits in the volume information flags
+ * @vol:        ntfs volume on which to modify the flags
+ * @flags:      flags to clear on the volume
+ *
+ * Turn off the bits given in @flags (restricted to VOLUME_FLAGS_MASK) in the
+ * volume information flags of @vol, leaving all other bits as they are.
+ * Return 0 on success and -errno on error.
+ */
+static inline int ntfs_clear_volume_flags(ntfs_volume *vol, VOLUME_FLAGS flags)
+{
+        const VOLUME_FLAGS mask = flags & VOLUME_FLAGS_MASK;
+        return ntfs_write_volume_flags(vol,
+                        vol->vol_flags & cpu_to_le16(~le16_to_cpu(mask)));
+}
+#endif /* NTFS_RW */
+/**
+ * ntfs_remount - change the mount options of a mounted ntfs filesystem
+ * @sb:         superblock of mounted ntfs filesystem
+ * @flags:      remount flags
+ * @opt:        remount options string
+ *
+ * Change the mount options of an already mounted ntfs filesystem.  Return 0 on
+ * success, -EROFS if remounting read-write fails and -EINVAL on bad options.
+ * NOTE:  The VFS sets the @sb->s_flags remount flags to @flags after
+ * ntfs_remount() returns successfully (i.e. returns 0).  Otherwise,
+ * @sb->s_flags are not changed.
+ */
+static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
+{
+        ntfs_volume *vol = NTFS_SB(sb);
+        ntfs_debug("Entering with remount options string: %s", opt);
+#ifndef NTFS_RW
+        /* For read-only compiled driver, enforce all read-only flags. */
+        *flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME;
+#else /* NTFS_RW */
+        /*
+         * For the read-write compiled driver, if we are remounting read-write,
+         * make sure there are no volume errors and that no unsupported volume
+         * flags are set.  Also, empty the logfile journal as it would become
+         * stale as soon as something is written to the volume and mark the
+         * volume dirty so that chkdsk is run if the volume is not umounted
+         * cleanly.  Finally, mark the quotas out of date so Windows rescans
+         * the volume on boot and updates them.
+         *
+         * When remounting read-only, mark the volume clean if no volume errors
+         * have occurred.
+         */
+        if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
+                static const char *es = ".  Cannot remount read-write.";
+                /* Remounting read-write. */
+                if (NVolErrors(vol)) {
+                        ntfs_error(sb, "Volume has errors and is read-only%s",
+                                        es);
+                        return -EROFS;
+                }
+                if (vol->vol_flags & VOLUME_IS_DIRTY) {
+                        ntfs_error(sb, "Volume is dirty and read-only%s", es);
+                        return -EROFS;
+                }
+                if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) {
+                        ntfs_error(sb, "Volume has unsupported flags set and "
+                                        "is read-only%s", es);
+                        return -EROFS;
+                }
+                if (ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) {
+                        ntfs_error(sb, "Failed to set dirty bit in volume "
+                                        "information flags%s", es);
+                        return -EROFS;
+                }
+#if 0
+                // TODO: Enable this code once we start modifying anything that
+                //       is different between NTFS 1.2 and 3.x...
+                /* Set NT4 compatibility flag on newer NTFS version volumes. */
+                if ((vol->major_ver > 1)) {
+                        if (ntfs_set_volume_flags(vol, VOLUME_MOUNTED_ON_NT4)) {
+                                ntfs_error(sb, "Failed to set NT4 "
+                                                "compatibility flag%s", es);
+                                NVolSetErrors(vol);
+                                return -EROFS;
+                        }
+                }
+#endif
+                if (!ntfs_empty_logfile(vol->logfile_ino)) {
+                        ntfs_error(sb, "Failed to empty journal $LogFile%s",
+                                        es);
+                        NVolSetErrors(vol);
+                        return -EROFS;
+                }
+                if (!ntfs_mark_quotas_out_of_date(vol)) {
+                        ntfs_error(sb, "Failed to mark quotas out of date%s",
+                                        es);
+                        NVolSetErrors(vol);
+                        return -EROFS;
+                }
+        } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) {
+                /* Remounting read-only. */
+                if (!NVolErrors(vol)) {
+                        if (ntfs_clear_volume_flags(vol, VOLUME_IS_DIRTY))
+                                ntfs_warning(sb, "Failed to clear dirty bit "
+                                                "in volume information "
+                                                "flags.  Run chkdsk.");
+                }
+        }
+#endif /* NTFS_RW */
+        // TODO: Deal with *flags.
+        if (!parse_options(vol, opt))
+                return -EINVAL;
+        ntfs_debug("Done.");
+        return 0;
+}
+/**
+ * is_boot_sector_ntfs - check whether a boot sector is a valid NTFS boot sector
+ * @sb:         Super block of the device to which @b belongs.
+ * @b:          Boot sector of device @sb to check.
+ * @silent:     If TRUE, all output will be silenced.
+ *
+ * is_boot_sector_ntfs() checks whether the boot sector @b is a valid NTFS boot
+ * sector. Returns TRUE if it is valid and FALSE if not.
+ *
+ * @sb is only needed for warning/error output, i.e. it can be NULL when silent
+ * is TRUE.
+ */
+static BOOL is_boot_sector_ntfs(const struct super_block *sb,
+                const NTFS_BOOT_SECTOR *b, const BOOL silent)
+{
+        /*
+         * Check that checksum == sum of u32 values from b to the checksum
+         * field. If checksum is zero, no checking is done.
+         */
+        if ((void*)b < (void*)&b->checksum && b->checksum) {
+                le32 *u;
+                u32 i;          /* Running sum, not an index. */
+                for (i = 0, u = (le32*)b; u < (le32*)(&b->checksum); ++u)
+                        i += le32_to_cpup(u);
+                if (le32_to_cpu(b->checksum) != i)
+                        goto not_ntfs;
+        }
+        /* Check OEMidentifier is "NTFS    " */
+        if (b->oem_id != magicNTFS)
+                goto not_ntfs;
+        /* Check bytes per sector value is between 256 and 4096. */
+        if (le16_to_cpu(b->bpb.bytes_per_sector) < 0x100 ||
+                        le16_to_cpu(b->bpb.bytes_per_sector) > 0x1000)
+                goto not_ntfs;
+        /* Check sectors per cluster value is valid. */
+        switch (b->bpb.sectors_per_cluster) {
+        case 1: case 2: case 4: case 8: case 16: case 32: case 64: case 128:
+                break;
+        default:
+                goto not_ntfs;
+        }
+        /* Check the cluster size is not above 65536 bytes. */
+        if ((u32)le16_to_cpu(b->bpb.bytes_per_sector) *
+                        b->bpb.sectors_per_cluster > 0x10000)
+                goto not_ntfs;
+        /* Check reserved/unused fields are really zero. */
+        if (le16_to_cpu(b->bpb.reserved_sectors) ||
+                        le16_to_cpu(b->bpb.root_entries) ||
+                        le16_to_cpu(b->bpb.sectors) ||
+                        le16_to_cpu(b->bpb.sectors_per_fat) ||
+                        le32_to_cpu(b->bpb.large_sectors) || b->bpb.fats)
+                goto not_ntfs;
+        /* Clusters per mft record must be 0xe1-0xf7 or a power of 2 <= 64. */
+        if ((u8)b->clusters_per_mft_record < 0xe1 ||
+                        (u8)b->clusters_per_mft_record > 0xf7)
+                switch (b->clusters_per_mft_record) {
+                case 1: case 2: case 4: case 8: case 16: case 32: case 64:
+                        break;
+                default:
+                        goto not_ntfs;
+                }
+        /* Clusters per index block must be 0xe1-0xf7 or a power of 2 <= 64. */
+        if ((u8)b->clusters_per_index_record < 0xe1 ||
+                        (u8)b->clusters_per_index_record > 0xf7)
+                switch (b->clusters_per_index_record) {
+                case 1: case 2: case 4: case 8: case 16: case 32: case 64:
+                        break;
+                default:
+                        goto not_ntfs;
+                }
+        /*
+         * Check for valid end of sector marker. We will work without it, but
+         * many BIOSes will refuse to boot from a bootsector if the magic is
+         * incorrect, so we emit a warning.
+         */
+        if (!silent && b->end_of_sector_marker != cpu_to_le16(0xaa55))
+                ntfs_warning(sb, "Invalid end of sector marker.");
+        return TRUE;
+not_ntfs:
+        return FALSE;
+}
+/**
+ * read_ntfs_boot_sector - read the NTFS boot sector of a device
+ * @sb:         super block of device to read the boot sector from
+ * @silent:     if true, suppress all output
+ *
+ * Reads the boot sector from the device and validates it. If that fails and
+ * errors=recover is in effect, tries to read the backup boot sector, first
+ * from the end of the device a-la NT4 and later and then from the middle of
+ * the device a-la NT3.51 and before.
+ *
+ * If only a backup boot sector is valid, it is written over the primary boot
+ * sector (unless mounted read-only or the primary boot sector is unreadable).
+ *
+ * NOTE: To call this function, @sb must have the fields s_dev, the ntfs super
+ * block (u.ntfs_sb), nr_blocks and the device flags (s_flags) initialized
+ * to their respective values.
+ *
+ * Return the unlocked buffer head containing the boot sector or NULL on error.
+ */
+static struct buffer_head *read_ntfs_boot_sector(struct super_block *sb,
+                const int silent)
+{
+        const char *read_err_str = "Unable to read %s boot sector.";
+        struct buffer_head *bh_primary, *bh_backup;
+        long nr_blocks = NTFS_SB(sb)->nr_blocks;
+        /* Try to read primary boot sector. */
+        if ((bh_primary = sb_bread(sb, 0))) {
+                if (is_boot_sector_ntfs(sb, (NTFS_BOOT_SECTOR*)
+                                bh_primary->b_data, silent))
+                        return bh_primary;
+                if (!silent)
+                        ntfs_error(sb, "Primary boot sector is invalid.");
+        } else if (!silent)
+                ntfs_error(sb, read_err_str, "primary");
+        if (!(NTFS_SB(sb)->on_errors & ON_ERRORS_RECOVER)) {
+                if (bh_primary)
+                        brelse(bh_primary);
+                if (!silent)
+                        ntfs_error(sb, "Mount option errors=recover not used. "
+                                        "Aborting without trying to recover.");
+                return NULL;
+        }
+        /* Try to read NT4+ backup boot sector. */
+        if ((bh_backup = sb_bread(sb, nr_blocks - 1))) {
+                if (is_boot_sector_ntfs(sb, (NTFS_BOOT_SECTOR*)
+                                bh_backup->b_data, silent))
+                        goto hotfix_primary_boot_sector;
+                brelse(bh_backup);
+        } else if (!silent)
+                ntfs_error(sb, read_err_str, "backup");
+        /* Try to read NT3.51- backup boot sector. */
+        if ((bh_backup = sb_bread(sb, nr_blocks >> 1))) {
+                if (is_boot_sector_ntfs(sb, (NTFS_BOOT_SECTOR*)
+                                bh_backup->b_data, silent))
+                        goto hotfix_primary_boot_sector;
+                if (!silent)
+                        ntfs_error(sb, "Could not find a valid backup boot "
+                                        "sector.");
+                brelse(bh_backup);
+        } else if (!silent)
+                ntfs_error(sb, read_err_str, "backup");
+        /* We failed. Cleanup and return. */
+        if (bh_primary)
+                brelse(bh_primary);
+        return NULL;
+hotfix_primary_boot_sector:
+        if (bh_primary) {
+                /*
+                 * If we managed to read sector zero and the volume is not
+                 * read-only, copy the found, valid backup boot sector to the
+                 * primary boot sector.
+                 */
+                if (!(sb->s_flags & MS_RDONLY)) {
+                        ntfs_warning(sb, "Hot-fix: Recovering invalid primary "
+                                        "boot sector from backup copy.");
+                        memcpy(bh_primary->b_data, bh_backup->b_data,
+                                        sb->s_blocksize);
+                        mark_buffer_dirty(bh_primary);
+                        sync_dirty_buffer(bh_primary);
+                        if (buffer_uptodate(bh_primary)) {
+                                brelse(bh_backup);
+                                return bh_primary;
+                        }
+                        ntfs_error(sb, "Hot-fix: Device write error while "
+                                        "recovering primary boot sector.");
+                } else {
+                        ntfs_warning(sb, "Hot-fix: Recovery of primary boot "
+                                        "sector failed: Read-only mount.");
+                }
+                brelse(bh_primary);     /* Fall back to the backup copy. */
+        }
+        ntfs_warning(sb, "Using backup boot sector.");
+        return bh_backup;
+}
+/**
+ * parse_ntfs_boot_sector - parse the boot sector and store the data in @vol
+ * @vol:        volume structure to initialise with data from boot sector
+ * @b:          boot sector to parse
+ *
+ * Parse the ntfs boot sector @b and store all important information therein in
+ * the ntfs super block @vol.  Return TRUE on success and FALSE on error.
+ */
+static BOOL parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b)
+{
+        unsigned int sectors_per_cluster_bits, nr_hidden_sects;
+        int clusters_per_mft_record, clusters_per_index_record;
+        s64 ll;
+        vol->sector_size = le16_to_cpu(b->bpb.bytes_per_sector);
+        vol->sector_size_bits = ffs(vol->sector_size) - 1;
+        ntfs_debug("vol->sector_size = %i (0x%x)", vol->sector_size,
+                        vol->sector_size);
+        ntfs_debug("vol->sector_size_bits = %i (0x%x)", vol->sector_size_bits,
+                        vol->sector_size_bits);
+        if (vol->sector_size != vol->sb->s_blocksize)
+                ntfs_warning(vol->sb, "The boot sector indicates a sector size "
+                                "different from the device sector size.");
+        ntfs_debug("sectors_per_cluster = 0x%x", b->bpb.sectors_per_cluster);
+        sectors_per_cluster_bits = ffs(b->bpb.sectors_per_cluster) - 1;
+        ntfs_debug("sectors_per_cluster_bits = 0x%x",
+                        sectors_per_cluster_bits);
+        nr_hidden_sects = le32_to_cpu(b->bpb.hidden_sectors);
+        ntfs_debug("number of hidden sectors = 0x%x", nr_hidden_sects);
+        vol->cluster_size = vol->sector_size << sectors_per_cluster_bits;
+        vol->cluster_size_mask = vol->cluster_size - 1;
+        vol->cluster_size_bits = ffs(vol->cluster_size) - 1;
+        ntfs_debug("vol->cluster_size = %i (0x%x)", vol->cluster_size,
+                        vol->cluster_size);
+        ntfs_debug("vol->cluster_size_mask = 0x%x", vol->cluster_size_mask);
+        ntfs_debug("vol->cluster_size_bits = %i (0x%x)",
+                        vol->cluster_size_bits, vol->cluster_size_bits);
+        if (vol->sector_size > vol->cluster_size) {
+                ntfs_error(vol->sb, "Sector sizes above the cluster size are "
+                                "not supported.  Sorry.");
+                return FALSE;
+        }
+        if (vol->sb->s_blocksize > vol->cluster_size) {
+                ntfs_error(vol->sb, "Cluster sizes smaller than the device "
+                                "sector size are not supported.  Sorry.");
+                return FALSE;
+        }
+        clusters_per_mft_record = b->clusters_per_mft_record;
+        ntfs_debug("clusters_per_mft_record = %i (0x%x)",
+                        clusters_per_mft_record, clusters_per_mft_record);
+        if (clusters_per_mft_record > 0)
+                vol->mft_record_size = vol->cluster_size <<
+                                (ffs(clusters_per_mft_record) - 1);
+        else
+                /*
+                 * When mft_record_size < cluster_size, clusters_per_mft_record
+                 * = -log2(mft_record_size) bytes. mft_record_size normally is
+                 * 1024 bytes, which is encoded as 0xF6 (-10 in decimal).
+                 */
+                vol->mft_record_size = 1 << -clusters_per_mft_record;
+        vol->mft_record_size_mask = vol->mft_record_size - 1;
+        vol->mft_record_size_bits = ffs(vol->mft_record_size) - 1;
+        ntfs_debug("vol->mft_record_size = %i (0x%x)", vol->mft_record_size,
+                        vol->mft_record_size);
+        ntfs_debug("vol->mft_record_size_mask = 0x%x",
+                        vol->mft_record_size_mask);
+        ntfs_debug("vol->mft_record_size_bits = %i (0x%x)",
+                        vol->mft_record_size_bits, vol->mft_record_size_bits);
+        /*
+         * We cannot support mft record sizes above the PAGE_CACHE_SIZE since
+         * we store $MFT/$DATA, the table of mft records in the page cache.
+         */
+        if (vol->mft_record_size > PAGE_CACHE_SIZE) {
+                ntfs_error(vol->sb, "Mft record size %i (0x%x) exceeds the "
+                                "page cache size on your system %lu (0x%lx).  "
+                                "This is not supported.  Sorry.",
+                                vol->mft_record_size, vol->mft_record_size,
+                                PAGE_CACHE_SIZE, PAGE_CACHE_SIZE);
+                return FALSE;
+        }
+        clusters_per_index_record = b->clusters_per_index_record;
+        ntfs_debug("clusters_per_index_record = %i (0x%x)",
+                        clusters_per_index_record, clusters_per_index_record);
+        if (clusters_per_index_record > 0)
+                vol->index_record_size = vol->cluster_size <<
+                                (ffs(clusters_per_index_record) - 1);
+        else
+                /*
+                 * When index_record_size < cluster_size,
+                 * clusters_per_index_record = -log2(index_record_size) bytes.
+                 * index_record_size normally equals 4096 bytes, which is
+                 * encoded as 0xF4 (-12 in decimal).
+                 */
+                vol->index_record_size = 1 << -clusters_per_index_record;
+        vol->index_record_size_mask = vol->index_record_size - 1;
+        vol->index_record_size_bits = ffs(vol->index_record_size) - 1;
+        ntfs_debug("vol->index_record_size = %i (0x%x)",
+                        vol->index_record_size, vol->index_record_size);
+        ntfs_debug("vol->index_record_size_mask = 0x%x",
+                        vol->index_record_size_mask);
+        ntfs_debug("vol->index_record_size_bits = %i (0x%x)",
+                        vol->index_record_size_bits,
+                        vol->index_record_size_bits);
+        /*
+         * Get the size of the volume in clusters and check for 64-bit-ness.
+         * Windows currently only uses 32 bits to save the clusters so we do
+         * the same as it is much faster on 32-bit CPUs.
+         */
+        ll = sle64_to_cpu(b->number_of_sectors) >> sectors_per_cluster_bits;
+        if ((u64)ll >= 1ULL << 32) {
+                ntfs_error(vol->sb, "Cannot handle 64-bit clusters.  Sorry.");
+                return FALSE;
+        }
+        vol->nr_clusters = ll;
+        ntfs_debug("vol->nr_clusters = 0x%llx", (long long)vol->nr_clusters);
+        /*
+         * On an architecture where unsigned long is 32-bits, we restrict the
+         * volume size to 2TiB (2^41). On a 64-bit architecture, the compiler
+         * will hopefully optimize the whole check away.
+         */
+        if (sizeof(unsigned long) < 8 &&
+                        (ll << vol->cluster_size_bits) >= (1ULL << 41)) {
+                ntfs_error(vol->sb, "Volume size (%lluTiB) is too large for "
+                                "this architecture.  Maximum supported is "
+                                "2TiB.  Sorry.", (unsigned long long)ll >>
+                                (40 - vol->cluster_size_bits));
+                return FALSE;
+        }
+        /* LCNs are signed on disk, so reject negative values as well. */
+        ll = sle64_to_cpu(b->mft_lcn);
+        if (ll < 0 || ll >= vol->nr_clusters) {
+                ntfs_error(vol->sb, "MFT LCN is beyond end of volume.  Weird.");
+                return FALSE;
+        }
+        vol->mft_lcn = ll;
+        ntfs_debug("vol->mft_lcn = 0x%llx", (long long)vol->mft_lcn);
+        /* The same range check applies to the mft mirror's start cluster. */
+        ll = sle64_to_cpu(b->mftmirr_lcn);
+        if (ll < 0 || ll >= vol->nr_clusters) {
+                ntfs_error(vol->sb, "MFTMirr LCN is beyond end of volume.  "
+                                "Weird.");
+                return FALSE;
+        }
+        vol->mftmirr_lcn = ll;
+        ntfs_debug("vol->mftmirr_lcn = 0x%llx", (long long)vol->mftmirr_lcn);
+#ifdef NTFS_RW
+        /*
+         * Work out the size of the mft mirror in number of mft records. If the
+         * cluster size is less than or equal to the size taken by four mft
+         * records, the mft mirror stores the first four mft records. If the
+         * cluster size is bigger than the size taken by four mft records, the
+         * mft mirror contains as many mft records as will fit into one
+         * cluster.
+         */
+        if (vol->cluster_size <= (4 << vol->mft_record_size_bits))
+                vol->mftmirr_size = 4;
+        else
+                vol->mftmirr_size = vol->cluster_size >>
+                                vol->mft_record_size_bits;
+        ntfs_debug("vol->mftmirr_size = %i", vol->mftmirr_size);
+#endif /* NTFS_RW */
+        vol->serial_no = le64_to_cpu(b->volume_serial_number);
+        ntfs_debug("vol->serial_no = 0x%llx",
+                        (unsigned long long)vol->serial_no);
+        return TRUE;
+}
+/**
+ * ntfs_setup_allocators - initialize the cluster and mft allocators
+ * @vol:        volume structure for which to setup the allocators
+ *
+ * Put the cluster (lcn) and mft allocator state of @vol at its start values.
+ * This only has an effect when the driver is built with NTFS_RW; otherwise
+ * just the mft zone multiplier is logged.
+ */
+static void ntfs_setup_allocators(ntfs_volume *vol)
+{
+#ifdef NTFS_RW
+        LCN zone_len, std_mft_lcn;
+#endif /* NTFS_RW */
+        ntfs_debug("vol->mft_zone_multiplier = 0x%x",
+                        vol->mft_zone_multiplier);
+#ifdef NTFS_RW
+        /*
+         * The MFT zone is a fraction of the volume size in clusters, selected
+         * by the zone multiplier: 4 gives 50%, 3 gives 37.5%, 2 gives 25% and
+         * every other value (normally 1) gives 12.5%.
+         */
+        zone_len = vol->nr_clusters;
+        if (vol->mft_zone_multiplier == 4)
+                zone_len >>= 1;
+        else if (vol->mft_zone_multiplier == 3)
+                zone_len = (zone_len + (zone_len >> 1)) >> 2;
+        else if (vol->mft_zone_multiplier == 2)
+                zone_len >>= 2;
+        else
+                zone_len >>= 3;
+        /*
+         * Both the start of the zone and the current position within it are
+         * initially set to vol->mft_lcn.
+         */
+        vol->mft_zone_pos = vol->mft_lcn;
+        vol->mft_zone_start = vol->mft_zone_pos;
+        ntfs_debug("vol->mft_zone_pos = 0x%llx",
+                        (unsigned long long)vol->mft_zone_pos);
+        /*
+         * Work out where mkntfs would have placed the $MFT on an unmodified
+         * volume.  If the actual mft_lcn is at that place or closer to the
+         * front, let the zone begin at cluster 0 so that the area reserved
+         * for the mft bitmap is protected by the mft zone too.  Non-standard
+         * volumes are left alone as the overhead would outweigh the speed
+         * gain.
+         */
+        std_mft_lcn = (8192 + 2 * vol->cluster_size - 1) / vol->cluster_size;
+        if (std_mft_lcn * vol->cluster_size < 16 * 1024)
+                std_mft_lcn = (16 * 1024 + vol->cluster_size - 1) /
+                                vol->cluster_size;
+        if (vol->mft_zone_start <= std_mft_lcn)
+                vol->mft_zone_start = 0;
+        ntfs_debug("vol->mft_zone_start = 0x%llx",
+                        (unsigned long long)vol->mft_zone_start);
+        /*
+         * On non-standard volumes the zone could extend past the end of the
+         * volume.  Keep halving the zone size until the zone end is below
+         * vol->nr_clusters.
+         */
+        for (vol->mft_zone_end = vol->mft_lcn + zone_len;
+                        vol->mft_zone_end >= vol->nr_clusters;
+                        vol->mft_zone_end = vol->mft_lcn + zone_len)
+                zone_len >>= 1;
+        ntfs_debug("vol->mft_zone_end = 0x%llx",
+                        (unsigned long long)vol->mft_zone_end);
+        /*
+         * Data zone 1 starts where the mft zone ends and data zone 2 starts
+         * at the beginning of the volume.
+         */
+        vol->data1_zone_pos = vol->mft_zone_end;
+        ntfs_debug("vol->data1_zone_pos = 0x%llx",
+                        (unsigned long long)vol->data1_zone_pos);
+        vol->data2_zone_pos = 0;
+        ntfs_debug("vol->data2_zone_pos = 0x%llx",
+                        (unsigned long long)vol->data2_zone_pos);
+        /* Mft data allocation begins at mft record 24. */
+        vol->mft_data_pos = 24;
+        ntfs_debug("vol->mft_data_pos = 0x%llx",
+                        (unsigned long long)vol->mft_data_pos);
+#endif /* NTFS_RW */
+}
+#ifdef NTFS_RW
+/**
+ * load_and_init_mft_mirror - load and setup the mft mirror inode for a volume
+ * @vol:        ntfs super block describing device whose mft mirror to load
+ *
+ * On success the inode is stored in @vol->mftmirr_ino.  No error message is
+ * printed here.  Return TRUE on success or FALSE on error.
+ */
+static BOOL load_and_init_mft_mirror(ntfs_volume *vol)
+{
+        struct inode *vi;
+        ntfs_inode *ni;
+        ntfs_debug("Entering.");
+        vi = ntfs_iget(vol->sb, FILE_MFTMirr);
+        /* Errors are reported by the caller, so fail silently here. */
+        if (IS_ERR(vi))
+                return FALSE;
+        if (is_bad_inode(vi)) {
+                iput(vi);
+                return FALSE;
+        }
+        ni = NTFS_I(vi);
+        /*
+         * ntfs_read_inode() will have set up defaults, so override what is
+         * special about $MFTMirr: it is a regular file owned by root with no
+         * access for anyone and no VFS initiated operations allowed on it.
+         */
+        vi->i_gid = 0;
+        vi->i_uid = 0;
+        vi->i_mode = S_IFREG;
+        vi->i_op = &ntfs_empty_inode_ops;
+        vi->i_fop = &ntfs_empty_file_ops;
+        /* Like $MFT, $MFTMirr is multi sector transfer protected. */
+        vi->i_mapping->a_ops = &ntfs_mst_aops;
+        NInoSetMstProtected(ni);
+        /*
+         * Our little cheat: fill in the index block size fields so that the
+         * async read io completion handler for directories can be reused.
+         */
+        ni->itype.index.block_size = vol->mft_record_size;
+        ni->itype.index.block_size_bits = vol->mft_record_size_bits;
+        /* Hand the inode over to the volume. */
+        vol->mftmirr_ino = vi;
+        ntfs_debug("Done.");
+        return TRUE;
+}
+/**
+ * check_mft_mirror - compare contents of the mft mirror with the mft
+ * @vol:        ntfs super block describing device whose mft mirror to check
+ *
+ * Return TRUE if the records and the mirror's runlist check out, else FALSE.
+ *
+ * Note, this function also results in the mft mirror runlist being completely
+ * mapped into memory.  The mft mirror write code requires this and will BUG()
+ * should it find an unmapped runlist element.
+ */
+static BOOL check_mft_mirror(ntfs_volume *vol)
+{
+        unsigned long index;
+        struct super_block *sb = vol->sb;
+        ntfs_inode *mirr_ni;
+        struct page *mft_page, *mirr_page;
+        u8 *kmft, *kmirr;
+        runlist_element *rl, rl2[2];
+        int mrecs_per_page, i;
+        ntfs_debug("Entering.");
+        /* Compare the first vol->mftmirr_size records of $MFT and $MFTMirr. */
+        mrecs_per_page = PAGE_CACHE_SIZE / vol->mft_record_size;
+        BUG_ON(!mrecs_per_page);
+        BUG_ON(!vol->mftmirr_size);
+        mft_page = mirr_page = NULL;
+        kmft = kmirr = NULL;
+        index = i = 0;
+        do {
+                u32 bytes;
+                /* Map the next pair of pages at every page boundary. */
+                if (!(i % mrecs_per_page)) {
+                        if (index) {    /* Not the first pair: drop the old. */
+                                ntfs_unmap_page(mft_page);
+                                ntfs_unmap_page(mirr_page);
+                        }
+                        /* Get the $MFT page. */
+                        mft_page = ntfs_map_page(vol->mft_ino->i_mapping,
+                                        index);
+                        if (IS_ERR(mft_page)) {
+                                ntfs_error(sb, "Failed to read $MFT.");
+                                return FALSE;
+                        }
+                        kmft = page_address(mft_page);
+                        /* Get the $MFTMirr page. */
+                        mirr_page = ntfs_map_page(vol->mftmirr_ino->i_mapping,
+                                        index);
+                        if (IS_ERR(mirr_page)) {
+                                ntfs_error(sb, "Failed to read $MFTMirr.");
+                                goto mft_unmap_out;
+                        }
+                        kmirr = page_address(mirr_page);
+                        ++index;
+                }
+                /* Reject records flagged as incomplete multi sector transfers. */
+                if (ntfs_is_baad_recordp((le32*)kmft)) {
+                        ntfs_error(sb, "Incomplete multi sector transfer "
+                                        "detected in mft record %i.", i);
+mm_unmap_out:           /* Shared failure exit: unmap both pages. */
+                        ntfs_unmap_page(mirr_page);
+mft_unmap_out:          /* Shared failure exit: only the $MFT page is mapped. */
+                        ntfs_unmap_page(mft_page);
+                        return FALSE;
+                }
+                if (ntfs_is_baad_recordp((le32*)kmirr)) {
+                        ntfs_error(sb, "Incomplete multi sector transfer "
+                                        "detected in mft mirror record %i.", i);
+                        goto mm_unmap_out;
+                }
+                /* Use $MFT's bytes_in_use, else the mirror's, else all of it. */
+                bytes = le32_to_cpu(((MFT_RECORD*)kmft)->bytes_in_use);
+                if (!bytes || bytes > vol->mft_record_size) {
+                        bytes = le32_to_cpu(((MFT_RECORD*)kmirr)->bytes_in_use);
+                        if (!bytes || bytes > vol->mft_record_size)
+                                bytes = vol->mft_record_size;
+                }
+                /* Compare the two records. */
+                if (memcmp(kmft, kmirr, bytes)) {
+                        ntfs_error(sb, "$MFT and $MFTMirr (record %i) do not "
+                                        "match.  Run ntfsfix or chkdsk.", i);
+                        goto mm_unmap_out;
+                }
+                kmft += vol->mft_record_size;
+                kmirr += vol->mft_record_size;
+        } while (++i < vol->mftmirr_size);
+        /* Release the last pages. */
+        ntfs_unmap_page(mft_page);
+        ntfs_unmap_page(mirr_page);
+        /* Build the expected runlist: a single run starting at mftmirr_lcn. */
+        rl2[0].vcn = 0;
+        rl2[0].lcn = vol->mftmirr_lcn;
+        rl2[0].length = (vol->mftmirr_size * vol->mft_record_size +
+                        vol->cluster_size - 1) / vol->cluster_size;
+        rl2[1].vcn = rl2[0].length;
+        rl2[1].lcn = LCN_ENOENT;
+        rl2[1].length = 0;
+        /*
+         * Because we have just read all of the mft mirror, we know we have
+         * mapped the full runlist for it.
+         */
+        mirr_ni = NTFS_I(vol->mftmirr_ino);
+        down_read(&mirr_ni->runlist.lock);
+        rl = mirr_ni->runlist.rl;
+        /* Compare element by element up to and including the terminator. */
+        i = 0;
+        do {
+                if (rl2[i].vcn != rl[i].vcn || rl2[i].lcn != rl[i].lcn ||
+                                rl2[i].length != rl[i].length) {
+                        ntfs_error(sb, "$MFTMirr location mismatch.  "
+                                        "Run chkdsk.");
+                        up_read(&mirr_ni->runlist.lock);
+                        return FALSE;
+                }
+        } while (rl2[i++].length);
+        up_read(&mirr_ni->runlist.lock);
+        ntfs_debug("Done.");
+        return TRUE;
+}
+/**
+ * load_and_check_logfile - load and check the logfile inode for a volume
+ * @vol:        ntfs super block describing device whose logfile to load
+ *
+ * On success store the inode in @vol->logfile_ino.  Return TRUE or FALSE.
+ */
+static BOOL load_and_check_logfile(ntfs_volume *vol)
+{
+        struct inode *log_vi;
+        ntfs_debug("Entering.");
+        log_vi = ntfs_iget(vol->sb, FILE_LogFile);
+        /* Failure to get the inode is reported by the caller. */
+        if (IS_ERR(log_vi))
+                return FALSE;
+        if (is_bad_inode(log_vi))
+                goto put_fail;
+        /* A failed check is reported by ntfs_check_logfile() itself. */
+        if (!ntfs_check_logfile(log_vi))
+                goto put_fail;
+        vol->logfile_ino = log_vi;
+        ntfs_debug("Done.");
+        return TRUE;
+put_fail:
+        iput(log_vi);
+        return FALSE;
+}
+/**
+ * load_and_init_quota - load and setup the quota file for a volume if present
+ * @vol:        ntfs super block describing device whose quota file to load
+ *
+ * Return TRUE on success or FALSE on error.  A missing $Quota is not an error:
+ * in that case @vol->quota_ino is not touched and TRUE is returned.
+ */
+static BOOL load_and_init_quota(ntfs_volume *vol)
+{
+        MFT_REF mref;
+        struct inode *vi;
+        ntfs_name *name = NULL;
+        static const ntfschar Quota[7] = { const_cpu_to_le16('$'),
+                        const_cpu_to_le16('Q'), const_cpu_to_le16('u'),
+                        const_cpu_to_le16('o'), const_cpu_to_le16('t'),
+                        const_cpu_to_le16('a'), 0 };
+        static ntfschar Q[3] = { const_cpu_to_le16('$'),
+                        const_cpu_to_le16('Q'), 0 };
+        ntfs_debug("Entering.");
+        /*
+         * The quota file lives in the extended system files directory
+         * $Extend, so look up the name $Quota there to get its inode number.
+         */
+        down(&vol->extend_ino->i_sem);
+        mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), Quota, 6,
+                        &name);
+        up(&vol->extend_ino->i_sem);
+        if (IS_ERR_MREF(mref) && MREF_ERR(mref) != -ENOENT) {
+                /* Anything other than "not found" is a real error. */
+                ntfs_error(vol->sb, "Failed to find inode number for $Quota.");
+                return FALSE;
+        }
+        if (IS_ERR_MREF(mref)) {
+                /*
+                 * No $Quota means quotas are disabled and have never been
+                 * enabled on this volume.  Set the out of date flag now as
+                 * there is no need to try to mark such quotas out of date.
+                 */
+                ntfs_debug("$Quota not present.  Volume does not have "
+                                "quotas enabled.");
+                NVolSetQuotaOutOfDate(vol);
+                return TRUE;
+        }
+        /* The type of match is irrelevant, so drop the returned name. */
+        if (name)
+                kfree(name);
+        /* With the inode number known, get the $Quota inode itself. */
+        vi = ntfs_iget(vol->sb, MREF(mref));
+        if (IS_ERR(vi))
+                goto load_failed;
+        if (is_bad_inode(vi)) {
+                iput(vi);
+                goto load_failed;
+        }
+        vol->quota_ino = vi;
+        /* Now the inode for the $Q index of $Quota. */
+        vi = ntfs_index_iget(vol->quota_ino, Q, 2);
+        if (IS_ERR(vi)) {
+                ntfs_error(vol->sb, "Failed to load $Quota/$Q index.");
+                return FALSE;
+        }
+        vol->quota_q_ino = vi;
+        ntfs_debug("Done.");
+        return TRUE;
+load_failed:
+        ntfs_error(vol->sb, "Failed to load $Quota.");
+        return FALSE;
+}
+/**
+ * load_and_init_attrdef - load the attribute definitions table for a volume
+ * @vol:        ntfs super block describing device whose attrdef to load
+ *
+ * Copy $AttrDef into @vol->attrdef.  Return TRUE on success or FALSE on error.
+ */
+static BOOL load_and_init_attrdef(ntfs_volume *vol)
+{
+        struct super_block *sb = vol->sb;
+        struct inode *ino;
+        struct page *page;
+        unsigned long index, max_index;
+        unsigned int size;
+        ntfs_debug("Entering.");
+        /* Read attrdef table and setup vol->attrdef and vol->attrdef_size. */
+        ino = ntfs_iget(sb, FILE_AttrDef);
+        if (IS_ERR(ino) || is_bad_inode(ino)) {
+                if (!IS_ERR(ino))
+                        iput(ino);
+                goto failed;
+        }
+        /* The size of FILE_AttrDef must be above 0 and fit inside 31 bits. */
+        if (!ino->i_size || ino->i_size > 0x7fffffff)
+                goto iput_failed;
+        vol->attrdef = (ATTR_DEF*)ntfs_malloc_nofs(ino->i_size);
+        if (!vol->attrdef)
+                goto iput_failed;
+        index = 0;
+        max_index = ino->i_size >> PAGE_CACHE_SHIFT;  /* number of full pages */
+        size = PAGE_CACHE_SIZE;
+        while (index < max_index) {
+                /* Copy one page; the goto below re-enters for the tail. */
+read_partial_attrdef_page:
+                page = ntfs_map_page(ino->i_mapping, index);
+                if (IS_ERR(page))
+                        goto free_iput_failed;
+                memcpy((u8*)vol->attrdef + (index++ << PAGE_CACHE_SHIFT),
+                                page_address(page), size);
+                ntfs_unmap_page(page);
+        };
+        if (size == PAGE_CACHE_SIZE) {  /* false once the tail was copied */
+                size = ino->i_size & ~PAGE_CACHE_MASK;  /* tail page bytes */
+                if (size)
+                        goto read_partial_attrdef_page;
+        }
+        vol->attrdef_size = ino->i_size;
+        ntfs_debug("Read %llu bytes from $AttrDef.", ino->i_size);
+        iput(ino);
+        return TRUE;
+free_iput_failed:
+        ntfs_free(vol->attrdef);
+        vol->attrdef = NULL;
+iput_failed:
+        iput(ino);
+failed:
+        ntfs_error(sb, "Failed to initialize attribute definition table.");
+        return FALSE;
+}
+#endif /* NTFS_RW */
+/**
+ * load_and_init_upcase - load the upcase table for an ntfs volume
+ * @vol:        ntfs super block describing device whose upcase to load
+ *
+ * Return TRUE on success (or on fallback to default_upcase), FALSE on error.
+ */
+static BOOL load_and_init_upcase(ntfs_volume *vol)
+{
+        struct super_block *sb = vol->sb;
+        struct inode *ino;
+        struct page *page;
+        unsigned long index, max_index;
+        unsigned int size;
+        int i, max;
+        ntfs_debug("Entering.");
+        /* Read upcase table and setup vol->upcase and vol->upcase_len. */
+        ino = ntfs_iget(sb, FILE_UpCase);
+        if (IS_ERR(ino) || is_bad_inode(ino)) {
+                if (!IS_ERR(ino))
+                        iput(ino);
+                goto upcase_failed;
+        }
+        /*
+         * The upcase size must not be above 64k Unicode characters, must not
+         * be zero and must be a multiple of sizeof(ntfschar).
+         */
+        if (!ino->i_size || ino->i_size & (sizeof(ntfschar) - 1) ||
+                        ino->i_size > 64ULL * 1024 * sizeof(ntfschar))
+                goto iput_upcase_failed;
+        vol->upcase = (ntfschar*)ntfs_malloc_nofs(ino->i_size);
+        if (!vol->upcase)
+                goto iput_upcase_failed;
+        index = 0;
+        max_index = ino->i_size >> PAGE_CACHE_SHIFT;  /* number of full pages */
+        size = PAGE_CACHE_SIZE;
+        while (index < max_index) {
+                /* Copy one page; the goto below re-enters for the tail. */
+read_partial_upcase_page:
+                page = ntfs_map_page(ino->i_mapping, index);
+                if (IS_ERR(page))
+                        goto iput_upcase_failed;
+                memcpy((char*)vol->upcase + (index++ << PAGE_CACHE_SHIFT),
+                                page_address(page), size);
+                ntfs_unmap_page(page);
+        };
+        if (size == PAGE_CACHE_SIZE) {  /* false once the tail was copied */
+                size = ino->i_size & ~PAGE_CACHE_MASK;  /* tail page bytes */
+                if (size)
+                        goto read_partial_upcase_page;
+        }
+        vol->upcase_len = ino->i_size >> UCHAR_T_SIZE_BITS;
+        ntfs_debug("Read %llu bytes from $UpCase (expected %zu bytes).",
+                        ino->i_size, 64 * 1024 * sizeof(ntfschar));
+        iput(ino);
+        down(&ntfs_lock);       /* default_upcase state is read under it */
+        if (!default_upcase) {
+                ntfs_debug("Using volume specified $UpCase since default is "
+                                "not present.");
+                up(&ntfs_lock);
+                return TRUE;
+        }
+        max = default_upcase_len;       /* compare over the shorter length */
+        if (max > vol->upcase_len)
+                max = vol->upcase_len;
+        for (i = 0; i < max; i++)
+                if (vol->upcase[i] != default_upcase[i])
+                        break;
+        if (i == max) {         /* no difference: share the default table */
+                ntfs_free(vol->upcase);
+                vol->upcase = default_upcase;
+                vol->upcase_len = max;
+                ntfs_nr_upcase_users++;
+                up(&ntfs_lock);
+                ntfs_debug("Volume specified $UpCase matches default. Using "
+                                "default.");
+                return TRUE;
+        }
+        up(&ntfs_lock);
+        ntfs_debug("Using volume specified $UpCase since it does not match "
+                        "the default.");
+        return TRUE;
+iput_upcase_failed:
+        iput(ino);
+        ntfs_free(vol->upcase);
+        vol->upcase = NULL;
+upcase_failed:          /* fall back to the default table if there is one */
+        down(&ntfs_lock);
+        if (default_upcase) {
+                vol->upcase = default_upcase;
+                vol->upcase_len = default_upcase_len;
+                ntfs_nr_upcase_users++;
+                up(&ntfs_lock);
+                ntfs_error(sb, "Failed to load $UpCase from the volume. Using "
+                                "default.");
+                return TRUE;
+        }
+        up(&ntfs_lock);
+        ntfs_error(sb, "Failed to initialize upcase table.");
+        return FALSE;
+}
+/**
+ * load_system_files - open the system files using normal functions
+ * @vol:        ntfs super block describing device whose system files to load
+ *
+ * Open the system files with normal access functions and complete setting up
+ * the ntfs super block @vol.
+ *
+ * The inodes and tables are obtained in a fixed order: the mft mirror (only
+ * if NTFS_RW), the mft bitmap, the upcase table, the attribute definitions
+ * table (only if NTFS_RW), $Bitmap, $Volume, the logfile (only if NTFS_RW)
+ * and the root directory.  On volumes with a major version of 3 or above,
+ * $Secure, $Extend and (only if NTFS_RW) the quota file follow.  On failure,
+ * the error labels at the end of the function release everything obtained so
+ * far in reverse order.
+ *
+ * If NTFS_RW is defined, several non-fatal problems cause a read-write mount
+ * to be converted to a read-only one, provided on_errors=continue or
+ * on_errors=remount-ro was specified; otherwise they fail the mount.
+ *
+ * Return TRUE on success or FALSE on error.
+ */
+static BOOL load_system_files(ntfs_volume *vol)
+{
+        struct super_block *sb = vol->sb;
+        MFT_RECORD *m;
+        VOLUME_INFORMATION *vi;
+        ntfs_attr_search_ctx *ctx;
+        ntfs_debug("Entering.");
+#ifdef NTFS_RW
+        /* Get mft mirror inode and compare the contents of $MFT and $MFTMirr. */
+        if (!load_and_init_mft_mirror(vol) || !check_mft_mirror(vol)) {
+                static const char *es1 = "Failed to load $MFTMirr";
+                static const char *es2 = "$MFTMirr does not match $MFT";
+                static const char *es3 = ".  Run ntfsfix and/or chkdsk.";
+                /* If a read-write mount, convert it to a read-only mount. */
+                if (!(sb->s_flags & MS_RDONLY)) {
+                        if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
+                                        ON_ERRORS_CONTINUE))) {
+                                ntfs_error(sb, "%s and neither on_errors="
+                                                "continue nor on_errors="
+                                                "remount-ro was specified%s",
+                                                !vol->mftmirr_ino ? es1 : es2,
+                                                es3);
+                                goto iput_mirr_err_out;
+                        }
+                        sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME;
+                        ntfs_error(sb, "%s.  Mounting read-only%s",
+                                        !vol->mftmirr_ino ? es1 : es2, es3);
+                } else
+                        ntfs_warning(sb, "%s.  Will not be able to remount "
+                                        "read-write%s",
+                                        !vol->mftmirr_ino ? es1 : es2, es3);
+                /* This will prevent a read-write remount. */
+                NVolSetErrors(vol);
+        }
+#endif /* NTFS_RW */
+        /* Get mft bitmap attribute inode. */
+        vol->mftbmp_ino = ntfs_attr_iget(vol->mft_ino, AT_BITMAP, NULL, 0);
+        if (IS_ERR(vol->mftbmp_ino)) {
+                ntfs_error(sb, "Failed to load $MFT/$BITMAP attribute.");
+                goto iput_mirr_err_out;
+        }
+        /* Read upcase table and setup @vol->upcase and @vol->upcase_len. */
+        if (!load_and_init_upcase(vol))
+                goto iput_mftbmp_err_out;
+#ifdef NTFS_RW
+        /*
+         * Read attribute definitions table and setup @vol->attrdef and
+         * @vol->attrdef_size.
+         */
+        if (!load_and_init_attrdef(vol))
+                goto iput_upcase_err_out;
+#endif /* NTFS_RW */
+        /*
+         * Get the cluster allocation bitmap inode and verify the size, no
+         * need for any locking at this stage as we are already running
+         * exclusively as we are mount in progress task.
+         */
+        vol->lcnbmp_ino = ntfs_iget(sb, FILE_Bitmap);
+        if (IS_ERR(vol->lcnbmp_ino) || is_bad_inode(vol->lcnbmp_ino)) {
+                if (!IS_ERR(vol->lcnbmp_ino))
+                        iput(vol->lcnbmp_ino);
+                goto bitmap_failed;
+        }
+        /* The bitmap must hold at least one bit per cluster of the volume. */
+        if ((vol->nr_clusters + 7) >> 3 > vol->lcnbmp_ino->i_size) {
+                iput(vol->lcnbmp_ino);
+                /* Also entered by goto from the failed ntfs_iget() above. */
+bitmap_failed:
+                ntfs_error(sb, "Failed to load $Bitmap.");
+                goto iput_attrdef_err_out;
+        }
+        /*
+         * Get the volume inode and setup our cache of the volume flags and
+         * version.
+         */
+        vol->vol_ino = ntfs_iget(sb, FILE_Volume);
+        if (IS_ERR(vol->vol_ino) || is_bad_inode(vol->vol_ino)) {
+                if (!IS_ERR(vol->vol_ino))
+                        iput(vol->vol_ino);
+                /* Also entered by goto from iput_volume_failed below. */
+volume_failed:
+                ntfs_error(sb, "Failed to load $Volume.");
+                goto iput_lcnbmp_err_out;
+        }
+        m = map_mft_record(NTFS_I(vol->vol_ino));
+        if (IS_ERR(m)) {
+                /* Also entered by goto from get_ctx_vol_failed below. */
+iput_volume_failed:
+                iput(vol->vol_ino);
+                goto volume_failed;
+        }
+        if (!(ctx = ntfs_attr_get_search_ctx(NTFS_I(vol->vol_ino), m))) {
+                ntfs_error(sb, "Failed to get attribute search context.");
+                goto get_ctx_vol_failed;
+        }
+        /*
+         * Find the volume information attribute.  It is rejected if it is
+         * non-resident or has any attribute flags set.
+         */
+        if (ntfs_attr_lookup(AT_VOLUME_INFORMATION, NULL, 0, 0, 0, NULL, 0,
+                        ctx) || ctx->attr->non_resident || ctx->attr->flags) {
+                /* Also entered by goto when the bounds checks below fail. */
+err_put_vol:
+                ntfs_attr_put_search_ctx(ctx);
+                /* Entered directly when no search context was obtained. */
+get_ctx_vol_failed:
+                unmap_mft_record(NTFS_I(vol->vol_ino));
+                goto iput_volume_failed;
+        }
+        vi = (VOLUME_INFORMATION*)((char*)ctx->attr +
+                        le16_to_cpu(ctx->attr->data.resident.value_offset));
+        /* Some bounds checks. */
+        if ((u8*)vi < (u8*)ctx->attr || (u8*)vi +
+                        le32_to_cpu(ctx->attr->data.resident.value_length) >
+                        (u8*)ctx->attr + le32_to_cpu(ctx->attr->length))
+                goto err_put_vol;
+        /* Copy the volume flags and version to the ntfs_volume structure. */
+        vol->vol_flags = vi->flags;
+        vol->major_ver = vi->major_ver;
+        vol->minor_ver = vi->minor_ver;
+        ntfs_attr_put_search_ctx(ctx);
+        unmap_mft_record(NTFS_I(vol->vol_ino));
+        printk(KERN_INFO "NTFS volume version %i.%i.\n", vol->major_ver,
+                        vol->minor_ver);
+#ifdef NTFS_RW
+        /* Make sure that no unsupported volume flags are set. */
+        if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) {
+                static const char *es1a = "Volume is dirty";
+                static const char *es1b = "Volume has unsupported flags set";
+                static const char *es2 = ".  Run chkdsk and mount in Windows.";
+                const char *es1;
+                
+                es1 = vol->vol_flags & VOLUME_IS_DIRTY ? es1a : es1b;
+                /* If a read-write mount, convert it to a read-only mount. */
+                if (!(sb->s_flags & MS_RDONLY)) {
+                        if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
+                                        ON_ERRORS_CONTINUE))) {
+                                ntfs_error(sb, "%s and neither on_errors="
+                                                "continue nor on_errors="
+                                                "remount-ro was specified%s",
+                                                es1, es2);
+                                goto iput_vol_err_out;
+                        }
+                        sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME;
+                        ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
+                } else
+                        ntfs_warning(sb, "%s.  Will not be able to remount "
+                                        "read-write%s", es1, es2);
+                /*
+                 * Do not set NVolErrors() because ntfs_remount() re-checks the
+                 * flags which we need to do in case any flags have changed.
+                 */
+        }
+        /*
+         * Get the inode for the logfile, check it and determine if the volume
+         * was shutdown cleanly.
+         */
+        if (!load_and_check_logfile(vol) ||
+                        !ntfs_is_logfile_clean(vol->logfile_ino)) {
+                static const char *es1a = "Failed to load $LogFile";
+                static const char *es1b = "$LogFile is not clean";
+                static const char *es2 = ".  Mount in Windows.";
+                const char *es1;
+                es1 = !vol->logfile_ino ? es1a : es1b;
+                /* If a read-write mount, convert it to a read-only mount. */
+                if (!(sb->s_flags & MS_RDONLY)) {
+                        if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
+                                        ON_ERRORS_CONTINUE))) {
+                                ntfs_error(sb, "%s and neither on_errors="
+                                                "continue nor on_errors="
+                                                "remount-ro was specified%s",
+                                                es1, es2);
+                                goto iput_logfile_err_out;
+                        }
+                        sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME;
+                        ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
+                } else
+                        ntfs_warning(sb, "%s.  Will not be able to remount "
+                                        "read-write%s", es1, es2);
+                /* This will prevent a read-write remount. */
+                NVolSetErrors(vol);
+        }
+        /* If (still) a read-write mount, mark the volume dirty. */
+        if (!(sb->s_flags & MS_RDONLY) &&
+                        ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) {
+                static const char *es1 = "Failed to set dirty bit in volume "
+                                "information flags";
+                static const char *es2 = ".  Run chkdsk.";
+                /* Convert to a read-only mount. */
+                if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
+                                ON_ERRORS_CONTINUE))) {
+                        ntfs_error(sb, "%s and neither on_errors=continue nor "
+                                        "on_errors=remount-ro was specified%s",
+                                        es1, es2);
+                        goto iput_logfile_err_out;
+                }
+                ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
+                sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME;
+                /*
+                 * Do not set NVolErrors() because ntfs_remount() might manage
+                 * to set the dirty flag in which case all would be well.
+                 */
+        }
+#if 0
+        // TODO: Enable this code once we start modifying anything that is
+        //       different between NTFS 1.2 and 3.x...
+        /*
+         * If (still) a read-write mount, set the NT4 compatibility flag on
+         * newer NTFS version volumes.
+         */
+        if (!(sb->s_flags & MS_RDONLY) && (vol->major_ver > 1) &&
+                        ntfs_set_volume_flags(vol, VOLUME_MOUNTED_ON_NT4)) {
+                static const char *es1 = "Failed to set NT4 compatibility flag";
+                static const char *es2 = ".  Run chkdsk.";
+                /* Convert to a read-only mount. */
+                if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
+                                ON_ERRORS_CONTINUE))) {
+                        ntfs_error(sb, "%s and neither on_errors=continue nor "
+                                        "on_errors=remount-ro was specified%s",
+                                        es1, es2);
+                        goto iput_logfile_err_out;
+                }
+                ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
+                sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME;
+                NVolSetErrors(vol);
+        }
+#endif
+        /* If (still) a read-write mount, empty the logfile. */
+        if (!(sb->s_flags & MS_RDONLY) &&
+                        !ntfs_empty_logfile(vol->logfile_ino)) {
+                static const char *es1 = "Failed to empty $LogFile";
+                static const char *es2 = ".  Mount in Windows.";
+                /* Convert to a read-only mount. */
+                if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
+                                ON_ERRORS_CONTINUE))) {
+                        ntfs_error(sb, "%s and neither on_errors=continue nor "
+                                        "on_errors=remount-ro was specified%s",
+                                        es1, es2);
+                        goto iput_logfile_err_out;
+                }
+                ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
+                sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME;
+                NVolSetErrors(vol);
+        }
+#endif /* NTFS_RW */
+        /* Get the root directory inode. */
+        vol->root_ino = ntfs_iget(sb, FILE_root);
+        if (IS_ERR(vol->root_ino) || is_bad_inode(vol->root_ino)) {
+                if (!IS_ERR(vol->root_ino))
+                        iput(vol->root_ino);
+                ntfs_error(sb, "Failed to load root directory.");
+                goto iput_logfile_err_out;
+        }
+        /* If on NTFS versions before 3.0, we are done. */
+        if (vol->major_ver < 3)
+                return TRUE;
+        /* NTFS 3.0+ specific initialization. */
+        /* Get the security descriptors inode. */
+        vol->secure_ino = ntfs_iget(sb, FILE_Secure);
+        if (IS_ERR(vol->secure_ino) || is_bad_inode(vol->secure_ino)) {
+                if (!IS_ERR(vol->secure_ino))
+                        iput(vol->secure_ino);
+                ntfs_error(sb, "Failed to load $Secure.");
+                goto iput_root_err_out;
+        }
+        // FIXME: Initialize security.
+        /* Get the extended system files' directory inode. */
+        vol->extend_ino = ntfs_iget(sb, FILE_Extend);
+        if (IS_ERR(vol->extend_ino) || is_bad_inode(vol->extend_ino)) {
+                if (!IS_ERR(vol->extend_ino))
+                        iput(vol->extend_ino);
+                ntfs_error(sb, "Failed to load $Extend.");
+                goto iput_sec_err_out;
+        }
+#ifdef NTFS_RW
+        /* Find the quota file, load it if present, and set it up. */
+        if (!load_and_init_quota(vol)) {
+                static const char *es1 = "Failed to load $Quota";
+                static const char *es2 = ".  Run chkdsk.";
+                /* If a read-write mount, convert it to a read-only mount. */
+                if (!(sb->s_flags & MS_RDONLY)) {
+                        if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
+                                        ON_ERRORS_CONTINUE))) {
+                                ntfs_error(sb, "%s and neither on_errors="
+                                                "continue nor on_errors="
+                                                "remount-ro was specified%s",
+                                                es1, es2);
+                                goto iput_quota_err_out;
+                        }
+                        sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME;
+                        ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
+                } else
+                        ntfs_warning(sb, "%s.  Will not be able to remount "
+                                        "read-write%s", es1, es2);
+                /* This will prevent a read-write remount. */
+                NVolSetErrors(vol);
+        }
+        /* If (still) a read-write mount, mark the quotas out of date. */
+        if (!(sb->s_flags & MS_RDONLY) &&
+                        !ntfs_mark_quotas_out_of_date(vol)) {
+                static const char *es1 = "Failed to mark quotas out of date";
+                static const char *es2 = ".  Run chkdsk.";
+                /* Convert to a read-only mount. */
+                if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
+                                ON_ERRORS_CONTINUE))) {
+                        ntfs_error(sb, "%s and neither on_errors=continue nor "
+                                        "on_errors=remount-ro was specified%s",
+                                        es1, es2);
+                        goto iput_quota_err_out;
+                }
+                ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
+                sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME;
+                NVolSetErrors(vol);
+        }
+        // TODO: Delete or checkpoint the $UsnJrnl if it exists.
+#endif /* NTFS_RW */
+        return TRUE;
+        /*
+         * Error unwinding.  Each label releases what was set up just before
+         * the corresponding failure point and then falls through to the
+         * labels below it, so everything is undone in reverse order.
+         */
+#ifdef NTFS_RW
+iput_quota_err_out:
+        /* The quota inodes are only released if they are set. */
+        if (vol->quota_q_ino)
+                iput(vol->quota_q_ino);
+        if (vol->quota_ino)
+                iput(vol->quota_ino);
+        iput(vol->extend_ino);
+#endif /* NTFS_RW */
+iput_sec_err_out:
+        iput(vol->secure_ino);
+iput_root_err_out:
+        iput(vol->root_ino);
+iput_logfile_err_out:
+#ifdef NTFS_RW
+        /* The logfile inode is only released if it is set. */
+        if (vol->logfile_ino)
+                iput(vol->logfile_ino);
+iput_vol_err_out:
+#endif /* NTFS_RW */
+        iput(vol->vol_ino);
+iput_lcnbmp_err_out:
+        iput(vol->lcnbmp_ino);
+iput_attrdef_err_out:
+        /* Throw away the table of attribute definitions, if there is one. */
+        vol->attrdef_size = 0;
+        if (vol->attrdef) {
+                ntfs_free(vol->attrdef);
+                vol->attrdef = NULL;
+        }
+#ifdef NTFS_RW
+iput_upcase_err_out:
+#endif /* NTFS_RW */
+        /*
+         * If we are using the global default upcase table, just drop our use
+         * count under ntfs_lock.  Otherwise free the volume's own table.
+         */
+        vol->upcase_len = 0;
+        down(&ntfs_lock);
+        if (vol->upcase == default_upcase) {
+                ntfs_nr_upcase_users--;
+                vol->upcase = NULL;
+        }
+        up(&ntfs_lock);
+        if (vol->upcase) {
+                ntfs_free(vol->upcase);
+                vol->upcase = NULL;
+        }
+iput_mftbmp_err_out:
+        iput(vol->mftbmp_ino);
+iput_mirr_err_out:
+#ifdef NTFS_RW
+        /* The mft mirror inode is only released if it is set. */
+        if (vol->mftmirr_ino)
+                iput(vol->mftmirr_ino);
+#endif /* NTFS_RW */
+        return FALSE;
+}
+/**
+ * ntfs_put_super - called by the vfs to unmount a volume
+ * @sb:         vfs superblock of volume to unmount
+ *
+ * ntfs_put_super() is called by the VFS (from fs/super.c::do_umount()) when
+ * the volume is being unmounted (umount system call has been invoked) and it
+ * releases all inodes and memory belonging to the NTFS specific part of the
+ * super block.
+ *
+ * If NTFS_RW is defined, the system inodes are first committed and, on a
+ * read-write mount without volume errors, the dirty bit in the volume
+ * information flags is cleared.  The system inodes are then released, the
+ * attribute definitions and upcase tables are dropped, the nls map is
+ * unloaded and finally the ntfs_volume itself is freed.
+ */
+static void ntfs_put_super(struct super_block *sb)
+{
+        ntfs_volume *vol = NTFS_SB(sb);
+        ntfs_debug("Entering.");
+#ifdef NTFS_RW
+        /*
+         * Commit all inodes while they are still open in case some of them
+         * cause others to be dirtied.
+         */
+        ntfs_commit_inode(vol->vol_ino);
+        /* NTFS 3.0+ specific. */
+        if (vol->major_ver >= 3) {
+                if (vol->quota_q_ino)
+                        ntfs_commit_inode(vol->quota_q_ino);
+                if (vol->quota_ino)
+                        ntfs_commit_inode(vol->quota_ino);
+                if (vol->extend_ino)
+                        ntfs_commit_inode(vol->extend_ino);
+                if (vol->secure_ino)
+                        ntfs_commit_inode(vol->secure_ino);
+        }
+        ntfs_commit_inode(vol->root_ino);
+        /* The two bitmap inodes are committed under their write locks. */
+        down_write(&vol->lcnbmp_lock);
+        ntfs_commit_inode(vol->lcnbmp_ino);
+        up_write(&vol->lcnbmp_lock);
+        down_write(&vol->mftbmp_lock);
+        ntfs_commit_inode(vol->mftbmp_ino);
+        up_write(&vol->mftbmp_lock);
+        if (vol->logfile_ino)
+                ntfs_commit_inode(vol->logfile_ino);
+        if (vol->mftmirr_ino)
+                ntfs_commit_inode(vol->mftmirr_ino);
+        ntfs_commit_inode(vol->mft_ino);
+        /*
+         * If a read-write mount and no volume errors have occurred, mark the
+         * volume clean.  Also, re-commit all affected inodes.
+         */
+        if (!(sb->s_flags & MS_RDONLY)) {
+                if (!NVolErrors(vol)) {
+                        if (ntfs_clear_volume_flags(vol, VOLUME_IS_DIRTY))
+                                ntfs_warning(sb, "Failed to clear dirty bit "
+                                                "in volume information "
+                                                "flags.  Run chkdsk.");
+                        ntfs_commit_inode(vol->vol_ino);
+                        ntfs_commit_inode(vol->root_ino);
+                        if (vol->mftmirr_ino)
+                                ntfs_commit_inode(vol->mftmirr_ino);
+                        ntfs_commit_inode(vol->mft_ino);
+                } else {
+                        ntfs_warning(sb, "Volume has errors.  Leaving volume "
+                                        "marked dirty.  Run chkdsk.");
+                }
+        }
+#endif /* NTFS_RW */
+        /*
+         * Release the system inodes, resetting each pointer to NULL once the
+         * reference has been dropped.
+         */
+        iput(vol->vol_ino);
+        vol->vol_ino = NULL;
+        /* NTFS 3.0+ specific clean up. */
+        if (vol->major_ver >= 3) {
+#ifdef NTFS_RW
+                if (vol->quota_q_ino) {
+                        iput(vol->quota_q_ino);
+                        vol->quota_q_ino = NULL;
+                }
+                if (vol->quota_ino) {
+                        iput(vol->quota_ino);
+                        vol->quota_ino = NULL;
+                }
+#endif /* NTFS_RW */
+                if (vol->extend_ino) {
+                        iput(vol->extend_ino);
+                        vol->extend_ino = NULL;
+                }
+                if (vol->secure_ino) {
+                        iput(vol->secure_ino);
+                        vol->secure_ino = NULL;
+                }
+        }
+        iput(vol->root_ino);
+        vol->root_ino = NULL;
+        /* The two bitmap inodes are released under their write locks. */
+        down_write(&vol->lcnbmp_lock);
+        iput(vol->lcnbmp_ino);
+        vol->lcnbmp_ino = NULL;
+        up_write(&vol->lcnbmp_lock);
+        down_write(&vol->mftbmp_lock);
+        iput(vol->mftbmp_ino);
+        vol->mftbmp_ino = NULL;
+        up_write(&vol->mftbmp_lock);
+#ifdef NTFS_RW
+        if (vol->logfile_ino) {
+                iput(vol->logfile_ino);
+                vol->logfile_ino = NULL;
+        }
+        if (vol->mftmirr_ino) {
+                /* Re-commit the mft mirror and mft just in case. */
+                ntfs_commit_inode(vol->mftmirr_ino);
+                ntfs_commit_inode(vol->mft_ino);
+                iput(vol->mftmirr_ino);
+                vol->mftmirr_ino = NULL;
+        }
+        /*
+         * If any dirty inodes are left, throw away all mft data page cache
+         * pages to allow a clean umount.  This should never happen any more
+         * due to mft.c::ntfs_mft_writepage() cleaning all the dirty pages as
+         * the underlying mft records are written out and cleaned.  If it does
+         * happen anyway, we want to know...
+         */
+        ntfs_commit_inode(vol->mft_ino);
+        write_inode_now(vol->mft_ino, 1);
+        if (!list_empty(&sb->s_dirty)) {
+                const char *s1, *s2;
+                down(&vol->mft_ino->i_sem);
+                truncate_inode_pages(vol->mft_ino->i_mapping, 0);
+                up(&vol->mft_ino->i_sem);
+                write_inode_now(vol->mft_ino, 1);
+                /*
+                 * Pick the message text depending on whether the dirty list
+                 * is still non-empty after the mft pages were thrown away.
+                 */
+                if (!list_empty(&sb->s_dirty)) {
+                        static const char *_s1 = "inodes";
+                        static const char *_s2 = "";
+                        s1 = _s1;
+                        s2 = _s2;
+                } else {
+                        static const char *_s1 = "mft pages";
+                        static const char *_s2 = "They have been thrown "
+                                        "away.  ";
+                        s1 = _s1;
+                        s2 = _s2;
+                }
+                ntfs_error(sb, "Dirty %s found at umount time.  %sYou should "
+                                "run chkdsk.  Please email "
+                                "linux-ntfs-dev@lists.sourceforge.net and say "
+                                "that you saw this message.  Thank you.", s1,
+                                s2);
+        }
+#endif /* NTFS_RW */
+        /* The mft inode is released after all the other system inodes. */
+        iput(vol->mft_ino);
+        vol->mft_ino = NULL;
+        /* Throw away the table of attribute definitions. */
+        vol->attrdef_size = 0;
+        if (vol->attrdef) {
+                ntfs_free(vol->attrdef);
+                vol->attrdef = NULL;
+        }
+        vol->upcase_len = 0;
+        /*
+         * Destroy the global default upcase table if necessary.  Also decrease
+         * the number of upcase users if we are a user.
+         */
+        down(&ntfs_lock);
+        if (vol->upcase == default_upcase) {
+                ntfs_nr_upcase_users--;
+                vol->upcase = NULL;
+        }
+        if (!ntfs_nr_upcase_users && default_upcase) {
+                ntfs_free(default_upcase);
+                default_upcase = NULL;
+        }
+        /*
+         * For cluster sizes up to 4096 bytes, drop our use of the compression
+         * buffers and free them if we were the last user.
+         */
+        if (vol->cluster_size <= 4096 && !--ntfs_nr_compression_users)
+                free_compression_buffers();
+        up(&ntfs_lock);
+        /* A still set upcase pointer is the volume's own table; free it. */
+        if (vol->upcase) {
+                ntfs_free(vol->upcase);
+                vol->upcase = NULL;
+        }
+        if (vol->nls_map) {
+                unload_nls(vol->nls_map);
+                vol->nls_map = NULL;
+        }
+        /* Detach the ntfs_volume from the super block before freeing it. */
+        sb->s_fs_info = NULL;
+        kfree(vol);
+        return;
+}
+/**
+ * get_nr_free_clusters - count the unallocated clusters of a volume
+ * @vol:        ntfs volume whose free clusters are to be counted
+ *
+ * Walk the cluster bitmap ($Bitmap) of the mounted NTFS volume @vol one page
+ * at a time and work out how many clusters are free.  The count starts out as
+ * the total number of clusters and every set bit encountered is subtracted
+ * from it.  Counting the clusters in use rather than the free ones means a
+ * partial last page needs no special treatment, given that its out of range
+ * bytes are zero.
+ *
+ * The cluster bitmap always has a size in multiples of 8 bytes, so up to 63
+ * bits beyond the end of the logical volume can exist.  They are marked as
+ * allocated although the clusters do not exist, hence they are added back
+ * once the scan has finished.
+ *
+ * A page which cannot be read is treated as if all clusters described by it
+ * were in use, so errors produce an underestimate rather than an
+ * overestimate.
+ *
+ * @vol->lcnbmp_lock is held for reading for the duration of the scan.
+ *
+ * Return the number of free clusters, which is never negative.
+ */
+static s64 get_nr_free_clusters(ntfs_volume *vol)
+{
+        struct address_space *bmp_mapping = vol->lcnbmp_ino->i_mapping;
+        filler_t *filler = (filler_t*)bmp_mapping->a_ops->readpage;
+        s64 free_cnt = vol->nr_clusters;
+        unsigned long pg, nr_pages;
+        unsigned int words_per_page;
+        ntfs_debug("Entering.");
+        /* Serialize accesses to the cluster bitmap. */
+        down_read(&vol->lcnbmp_lock);
+        /*
+         * Bits to bytes, rounded up, then bytes to pages, rounded up again,
+         * i.e. one full plus one partial page gives nr_pages = 2.
+         */
+        nr_pages = (((vol->nr_clusters + 7) >> 3) + PAGE_CACHE_SIZE - 1) >>
+                        PAGE_CACHE_SHIFT;
+        /* Every page is scanned as an array of 32-bit words. */
+        words_per_page = PAGE_CACHE_SIZE >> 2;
+        ntfs_debug("Reading $Bitmap, max_index = 0x%lx, max_size = 0x%x.",
+                        nr_pages, words_per_page);
+        for (pg = 0UL; pg < nr_pages; pg++) {
+                struct page *page;
+                u32 *base, *word;
+                /*
+                 * Get the page from the page cache, reading it in from
+                 * backing store if needed.  This takes a reference on it.
+                 */
+                page = read_cache_page(bmp_mapping, pg, filler, NULL);
+                /* Synchronous error: count the whole page as in use. */
+                if (IS_ERR(page)) {
+                        ntfs_debug("Sync read_cache_page() error. Skipping "
+                                        "page (index 0x%lx).", pg);
+                        free_cnt -= PAGE_CACHE_SIZE * 8;
+                        continue;
+                }
+                wait_on_page_locked(page);
+                if (PageUptodate(page)) {
+                        /*
+                         * Subtract the set bits of each 32-bit word.  On a
+                         * partial last page this does a little extra work
+                         * without affecting the result as the out of range
+                         * bytes are set to zero by ntfs_readpage().
+                         */
+                        base = (u32*)kmap_atomic(page, KM_USER0);
+                        for (word = base; word < base + words_per_page; word++)
+                                free_cnt -= (s64)hweight32(*word);
+                        kunmap_atomic(base, KM_USER0);
+                } else {
+                        /* Asynchronous error: count the page as in use. */
+                        ntfs_debug("Async read_cache_page() error. Skipping "
+                                        "page (index 0x%lx).", pg);
+                        free_cnt -= PAGE_CACHE_SIZE * 8;
+                }
+                page_cache_release(page);
+        }
+        ntfs_debug("Finished reading $Bitmap, last index = 0x%lx.", pg - 1);
+        /*
+         * Give back the bits lying beyond the end of the logical volume (see
+         * the function description).
+         */
+        if (vol->nr_clusters & 63)
+                free_cnt += 64 - (vol->nr_clusters & 63);
+        up_read(&vol->lcnbmp_lock);
+        /* Read errors may have driven the count below zero. */
+        if (free_cnt < 0)
+                free_cnt = 0;
+        ntfs_debug("Exiting.");
+        return free_cnt;
+}
+/**
+ * __get_nr_free_mft_records - count the unused mft records of a volume
+ * @vol:        ntfs volume whose free mft records (inodes) are to be counted
+ *
+ * Walk the mft bitmap ($MFT/$BITMAP) of the mounted NTFS volume @vol one page
+ * at a time and work out how many mft records are free.  The count starts out
+ * as the number of mft records that fit into the current size of $MFT and
+ * every set bit encountered is subtracted from it.  Counting the records in
+ * use rather than the free ones means a partial last page needs no special
+ * treatment, given that its out of range bytes are zero.
+ *
+ * A page which cannot be read is treated as if all mft records described by
+ * it were in use, so errors produce an underestimate rather than an
+ * overestimate.
+ *
+ * NOTE: Caller must hold mftbmp_lock rw_semaphore for reading or writing.
+ *
+ * Return the number of free mft records.
+ */
+static unsigned long __get_nr_free_mft_records(ntfs_volume *vol)
+{
+        struct address_space *bmp_mapping = vol->mftbmp_ino->i_mapping;
+        filler_t *filler = (filler_t*)bmp_mapping->a_ops->readpage;
+        s64 free_cnt;
+        unsigned long pg, nr_pages;
+        unsigned int words_per_page;
+        ntfs_debug("Entering.");
+        /* Number of mft records in file system (at this point in time). */
+        free_cnt = vol->mft_ino->i_size >> vol->mft_record_size_bits;
+        /*
+         * Only records within the initialized size can have their bit set.
+         * Convert that many bits to bytes, rounded up, then to pages, rounded
+         * up again, i.e. one full plus one partial page gives nr_pages = 2.
+         */
+        nr_pages = ((((NTFS_I(vol->mft_ino)->initialized_size >>
+                        vol->mft_record_size_bits) + 7) >> 3) +
+                        PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+        /* Every page is scanned as an array of 32-bit words. */
+        words_per_page = PAGE_CACHE_SIZE >> 2;
+        ntfs_debug("Reading $MFT/$BITMAP, max_index = 0x%lx, max_size = "
+                        "0x%x.", nr_pages, words_per_page);
+        for (pg = 0UL; pg < nr_pages; pg++) {
+                struct page *page;
+                u32 *base, *word;
+                /*
+                 * Get the page from the page cache, reading it in from
+                 * backing store if needed.  This takes a reference on it.
+                 */
+                page = read_cache_page(bmp_mapping, pg, filler, NULL);
+                /* Synchronous error: count the whole page as in use. */
+                if (IS_ERR(page)) {
+                        ntfs_debug("Sync read_cache_page() error. Skipping "
+                                        "page (index 0x%lx).", pg);
+                        free_cnt -= PAGE_CACHE_SIZE * 8;
+                        continue;
+                }
+                wait_on_page_locked(page);
+                if (PageUptodate(page)) {
+                        /*
+                         * Subtract the set bits of each 32-bit word.  On a
+                         * partial last page this does a little extra work
+                         * without affecting the result as the out of range
+                         * bytes are set to zero by ntfs_readpage().
+                         */
+                        base = (u32*)kmap_atomic(page, KM_USER0);
+                        for (word = base; word < base + words_per_page; word++)
+                                free_cnt -= (s64)hweight32(*word);
+                        kunmap_atomic(base, KM_USER0);
+                } else {
+                        /* Asynchronous error: count the page as in use. */
+                        ntfs_debug("Async read_cache_page() error. Skipping "
+                                        "page (index 0x%lx).", pg);
+                        free_cnt -= PAGE_CACHE_SIZE * 8;
+                }
+                page_cache_release(page);
+        }
+        ntfs_debug("Finished reading $MFT/$BITMAP, last index = 0x%lx.",
+                        pg - 1);
+        /* Read errors may have driven the count below zero. */
+        if (free_cnt < 0)
+                free_cnt = 0;
+        ntfs_debug("Exiting.");
+        return free_cnt;
+}
+/**
+ * ntfs_statfs - report statistics for a mounted NTFS volume
+ * @sb:         super block of the mounted volume
+ * @sfs:        statfs structure to fill in
+ *
+ * Fill in @sfs with a snapshot of the state of the NTFS volume @sb (@sfs is
+ * initialized with zeros before ntfs_statfs is called).  All values, the
+ * totals included, are only correct for the moment in time at which we are
+ * called: for example the total number of file nodes can keep growing until
+ * there is no more space left on the volume at all.
+ *
+ * Called from vfs_statfs which is used to handle the statfs, fstatfs, and
+ * ustat system calls.
+ *
+ * Always returns 0.
+ */
+static int ntfs_statfs(struct super_block *sb, struct kstatfs *sfs)
+{
+        ntfs_volume *vol = NTFS_SB(sb);
+        s64 free_blocks;
+        ntfs_debug("Entering.");
+        /* File system type and optimal transfer block size. */
+        sfs->f_type = NTFS_SB_MAGIC;
+        sfs->f_bsize = PAGE_CACHE_SIZE;
+        /*
+         * Total data blocks in units of f_bsize.  Inodes live in data blocks
+         * as well ($MFT is a file) so this is simply the total number of
+         * clusters converted to f_bsize units.
+         */
+        sfs->f_blocks = (vol->nr_clusters << vol->cluster_size_bits) >>
+                        PAGE_CACHE_SHIFT;
+        /* Free data blocks in units of f_bsize, never reported negative. */
+        free_blocks = (get_nr_free_clusters(vol) << vol->cluster_size_bits) >>
+                        PAGE_CACHE_SHIFT;
+        if (free_blocks < 0LL)
+                free_blocks = 0LL;
+        sfs->f_bfree = free_blocks;
+        /* Blocks available to non-superuser are the same on NTFS. */
+        sfs->f_bavail = sfs->f_bfree;
+        /*
+         * The inode counts are taken with the mft bitmap lock held for
+         * reading so that both refer to the same state of the inode bitmap.
+         */
+        down_read(&vol->mftbmp_lock);
+        /* Number of inodes at this point in time. */
+        sfs->f_files = vol->mft_ino->i_size >> vol->mft_record_size_bits;
+        /* Free inodes, based on the current total count. */
+        sfs->f_ffree = __get_nr_free_mft_records(vol);
+        up_read(&vol->mftbmp_lock);
+        /*
+         * File system id.  We interpret this as an id of the mounted volume
+         * rather than of the driver (the latter is already given by f_type),
+         * so we use the 64-bit volume serial number: the least significant
+         * 32 bits go into f_fsid[0] and the most significant 32 bits into
+         * f_fsid[1].
+         */
+        sfs->f_fsid.val[0] = vol->serial_no & 0xffffffff;
+        sfs->f_fsid.val[1] = (vol->serial_no >> 32) & 0xffffffff;
+        /* Maximum length of filenames. */
+        sfs->f_namelen = NTFS_MAX_NAME_LEN;
+        return 0;
+}
+/**
+ * The complete super operations.
+ *
+ * Installed into sb->s_op by ntfs_fill_super().  The write related methods
+ * are only compiled in when NTFS_RW is defined and of those only
+ * ->write_inode is currently provided; the others are kept as commented out
+ * placeholders.
+ */
+static struct super_operations ntfs_sops = {
+        .alloc_inode    = ntfs_alloc_big_inode,   /* VFS: Allocate new inode. */
+        .destroy_inode  = ntfs_destroy_big_inode, /* VFS: Deallocate inode. */
+        .put_inode      = ntfs_put_inode,         /* VFS: Called just before
+                                                     the inode reference count
+                                                     is decreased. */
+#ifdef NTFS_RW
+        //.dirty_inode  = NULL,                 /* VFS: Called from
+        //                                         __mark_inode_dirty(). */
+        .write_inode    = ntfs_write_inode,     /* VFS: Write dirty inode to
+                                                   disk. */
+        //.drop_inode   = NULL,                 /* VFS: Called just after the
+        //                                         inode reference count has
+        //                                         been decreased to zero.
+        //                                         NOTE: The inode lock is
+        //                                         held. See fs/inode.c::
+        //                                         generic_drop_inode(). */
+        //.delete_inode = NULL,                 /* VFS: Delete inode from disk.
+        //                                         Called when i_count becomes
+        //                                         0 and i_nlink is also 0. */
+        //.write_super  = NULL,                 /* Flush dirty super block to
+        //                                         disk. */
+        //.sync_fs      = NULL,                 /* ? */
+        //.write_super_lockfs   = NULL,         /* ? */
+        //.unlockfs     = NULL,                 /* ? */
+#endif /* NTFS_RW */
+        .put_super      = ntfs_put_super,       /* Syscall: umount. */
+        .statfs         = ntfs_statfs,          /* Syscall: statfs */
+        .remount_fs     = ntfs_remount,         /* Syscall: mount -o remount. */
+        .clear_inode    = ntfs_clear_big_inode, /* VFS: Called when an inode is
+                                                   removed from memory. */
+        //.umount_begin = NULL,                 /* Forced umount. */
+        .show_options   = ntfs_show_options,    /* Show mount options in
+                                                   proc. */
+};
+/**
+ * Declarations for NTFS specific export operations (fs/ntfs/namei.c).
+ *
+ * These are only referenced by ntfs_export_ops below.
+ */
+extern struct dentry *ntfs_get_parent(struct dentry *child_dent);
+extern struct dentry *ntfs_get_dentry(struct super_block *sb, void *fh);
+/**
+ * Export operations allowing NFS exporting of mounted NTFS partitions.
+ *
+ * Installed into sb->s_export_op at the end of a successful
+ * ntfs_fill_super().  Only ->get_parent() and ->get_dentry() are set here.
+ *
+ * We use the default ->decode_fh() and ->encode_fh() for now.  Note that they
+ * use 32 bits to store the inode number which is an unsigned long so on 64-bit
+ * architectures is usually 64 bits so it would all fail horribly on huge
+ * volumes.  I guess we need to define our own encode and decode fh functions
+ * that store 64-bit inode numbers at some point but for now we will ignore the
+ * problem...
+ *
+ * We also use the default ->get_name() helper (used by ->decode_fh() via
+ * fs/exportfs/expfs.c::find_exported_dentry()) as that is completely fs
+ * independent.
+ *
+ * The default ->get_parent() just returns -EACCES so we have to provide our
+ * own and the default ->get_dentry() is incompatible with NTFS due to not
+ * allowing the inode number 0 which is used in NTFS for the system file $MFT
+ * and due to using iget() whereas NTFS needs ntfs_iget().
+ */
+static struct export_operations ntfs_export_ops = {
+        .get_parent     = ntfs_get_parent,      /* Find the parent of a given
+                                                   directory. */
+        .get_dentry     = ntfs_get_dentry,      /* Find a dentry for the inode
+                                                   given a file handle
+                                                   sub-fragment. */
+};
+/**
+ * ntfs_fill_super - mount an ntfs file system
+ * @sb:         super block of ntfs file system to mount
+ * @opt:        string containing the mount options
+ * @silent:     silence error output
+ *
+ * ntfs_fill_super() is called by the VFS to mount the device described by @sb
+ * with the mount options in @opt with the NTFS file system.
+ *
+ * If @silent is true, remain silent even if errors are detected. This is used
+ * during bootup, when the kernel tries to mount the root file system with all
+ * registered file systems one after the other until one succeeds. This implies
+ * that all file systems except the correct one will quite correctly and
+ * expectedly return an error, but nobody wants to see error messages when in
+ * fact this is what is supposed to happen.
+ *
+ * The big kernel lock is dropped with unlock_kernel() once the ntfs_volume has
+ * been allocated and initialized and it is taken again with lock_kernel()
+ * before returning, both on success and on the error paths that follow the
+ * unlock.
+ *
+ * Return 0 on success, -ENOMEM if the ntfs_volume structure cannot be
+ * allocated and -EINVAL on any other failure.
+ *
+ * NOTE: @sb->s_flags contains the mount options flags.
+ */
+static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
+{
+        ntfs_volume *vol;
+        struct buffer_head *bh;
+        struct inode *tmp_ino;
+        int result;
+        ntfs_debug("Entering.");
+#ifndef NTFS_RW
+        /* Without write support the volume is always mounted read-only. */
+        sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME;
+#endif /* ! NTFS_RW */
+        /* Allocate a new ntfs_volume and place it in sb->s_fs_info. */
+        sb->s_fs_info = kmalloc(sizeof(ntfs_volume), GFP_NOFS);
+        vol = NTFS_SB(sb);
+        if (!vol) {
+                if (!silent)
+                        ntfs_error(sb, "Allocation of NTFS volume structure "
+                                        "failed. Aborting mount...");
+                return -ENOMEM;
+        }
+        /* Initialize ntfs_volume structure. */
+        memset(vol, 0, sizeof(ntfs_volume));
+        vol->sb = sb;
+        vol->upcase = NULL;
+        vol->attrdef = NULL;
+        vol->mft_ino = NULL;
+        vol->mftbmp_ino = NULL;
+        init_rwsem(&vol->mftbmp_lock);
+#ifdef NTFS_RW
+        vol->mftmirr_ino = NULL;
+        vol->logfile_ino = NULL;
+#endif /* NTFS_RW */
+        vol->lcnbmp_ino = NULL;
+        init_rwsem(&vol->lcnbmp_lock);
+        vol->vol_ino = NULL;
+        vol->root_ino = NULL;
+        vol->secure_ino = NULL;
+        vol->extend_ino = NULL;
+#ifdef NTFS_RW
+        vol->quota_ino = NULL;
+        vol->quota_q_ino = NULL;
+#endif /* NTFS_RW */
+        vol->nls_map = NULL;
+        /*
+         * Default is group and other don't have any access to files or
+         * directories while owner has full access. Further, files by default
+         * are not executable but directories are of course browseable.
+         */
+        vol->fmask = 0177;
+        vol->dmask = 0077;
+        /* Every return below this point retakes the lock with lock_kernel(). */
+        unlock_kernel();
+        /* Important to get the mount options dealt with now. */
+        if (!parse_options(vol, (char*)opt))
+                goto err_out_now;
+        /*
+         * TODO: Fail safety check. In the future we should really be able to
+         * cope with this being the case, but for now just bail out.
+         */
+        if (bdev_hardsect_size(sb->s_bdev) > NTFS_BLOCK_SIZE) {
+                if (!silent)
+                        ntfs_error(sb, "Device has unsupported hardsect_size.");
+                goto err_out_now;
+        }
+        /* Setup the device access block size to NTFS_BLOCK_SIZE. */
+        if (sb_set_blocksize(sb, NTFS_BLOCK_SIZE) != NTFS_BLOCK_SIZE) {
+                if (!silent)
+                        ntfs_error(sb, "Unable to set block size.");
+                goto err_out_now;
+        }
+        /* Get the size of the device in units of NTFS_BLOCK_SIZE bytes. */
+        vol->nr_blocks = sb->s_bdev->bd_inode->i_size >> NTFS_BLOCK_SIZE_BITS;
+        /* Read the boot sector and return unlocked buffer head to it. */
+        if (!(bh = read_ntfs_boot_sector(sb, silent))) {
+                if (!silent)
+                        ntfs_error(sb, "Not an NTFS volume.");
+                goto err_out_now;
+        }
+        /*
+         * Extract the data from the boot sector and setup the ntfs super block
+         * using it.
+         */
+        result = parse_ntfs_boot_sector(vol, (NTFS_BOOT_SECTOR*)bh->b_data);
+        /*
+         * Initialize the cluster and mft allocators.  Note this is done
+         * before @result is checked below.
+         */
+        ntfs_setup_allocators(vol);
+        brelse(bh);
+        if (!result) {
+                if (!silent)
+                        ntfs_error(sb, "Unsupported NTFS filesystem.");
+                goto err_out_now;
+        }
+        /*
+         * TODO: When we start coping with sector sizes different from
+         * NTFS_BLOCK_SIZE, we now probably need to set the blocksize of the
+         * device (probably to NTFS_BLOCK_SIZE).
+         */
+        /* Setup remaining fields in the super block. */
+        sb->s_magic = NTFS_SB_MAGIC;
+        /*
+         * Ntfs allows 63 bits for the file size, i.e. correct would be:
+         *      sb->s_maxbytes = ~0ULL >> 1;
+         * But the kernel uses a long as the page cache page index which on
+         * 32-bit architectures is only 32-bits. MAX_LFS_FILESIZE is kernel
+         * defined to the maximum the page cache page index can cope with
+         * without overflowing the index or to 2^63 - 1, whichever is smaller.
+         */
+        sb->s_maxbytes = MAX_LFS_FILESIZE;
+        sb->s_time_gran = 100;
+        /*
+         * Now load the metadata required for the page cache and our address
+         * space operations to function. We do this by allocating a fresh
+         * inode with new_inode(), giving it the inode number of $MFT, hashing
+         * it and reading it in with the specialised ntfs_read_inode_mount()
+         * which is sufficient to allow our normal inode operations and
+         * associated address space operations to function.
+         */
+        sb->s_op = &ntfs_sops;
+        tmp_ino = new_inode(sb);
+        if (!tmp_ino) {
+                if (!silent)
+                        ntfs_error(sb, "Failed to load essential metadata.");
+                goto err_out_now;
+        }
+        tmp_ino->i_ino = FILE_MFT;
+        insert_inode_hash(tmp_ino);
+        if (ntfs_read_inode_mount(tmp_ino) < 0) {
+                if (!silent)
+                        ntfs_error(sb, "Failed to load essential metadata.");
+                goto iput_tmp_ino_err_out_now;
+        }
+        /* ntfs_lock protects the driver wide user counts and upcase table. */
+        down(&ntfs_lock);
+        /*
+         * The current mount is a compression user if the cluster size is
+         * less than or equal 4kiB.  The first compression user allocates the
+         * compression buffers.
+         */
+        if (vol->cluster_size <= 4096 && !ntfs_nr_compression_users++) {
+                result = allocate_compression_buffers();
+                if (result) {
+                        ntfs_error(NULL, "Failed to allocate buffers "
+                                        "for compression engine.");
+                        ntfs_nr_compression_users--;
+                        up(&ntfs_lock);
+                        goto iput_tmp_ino_err_out_now;
+                }
+        }
+        /*
+         * Generate the global default upcase table if necessary.  Also
+         * temporarily increment the number of upcase users to avoid race
+         * conditions with concurrent (u)mounts.
+         */
+        if (!default_upcase)
+                default_upcase = generate_default_upcase();
+        ntfs_nr_upcase_users++;
+        up(&ntfs_lock);
+        /*
+         * From now on, ignore @silent parameter. If we fail below this line,
+         * it will be due to a corrupt fs or a system error, so we report it.
+         */
+        /*
+         * Open the system files with normal access functions and complete
+         * setting up the ntfs super block.
+         */
+        if (!load_system_files(vol)) {
+                ntfs_error(sb, "Failed to load system files.");
+                goto unl_upcase_iput_tmp_ino_err_out_now;
+        }
+        if ((sb->s_root = d_alloc_root(vol->root_ino))) {
+                /* We increment i_count simulating an ntfs_iget(). */
+                atomic_inc(&vol->root_ino->i_count);
+                ntfs_debug("Exiting, status successful.");
+                /*
+                 * Drop the temporary upcase user taken above and release the
+                 * default upcase if it has no users.
+                 */
+                down(&ntfs_lock);
+                if (!--ntfs_nr_upcase_users && default_upcase) {
+                        ntfs_free(default_upcase);
+                        default_upcase = NULL;
+                }
+                up(&ntfs_lock);
+                sb->s_export_op = &ntfs_export_ops;
+                lock_kernel();
+                return 0;
+        }
+        ntfs_error(sb, "Failed to allocate root directory.");
+        /* Clean up after the successful load_system_files() call from above. */
+        // TODO: Use ntfs_put_super() instead of repeating all this code...
+        // FIXME: Should mark the volume clean as the error is most likely
+        //        -ENOMEM.
+        iput(vol->vol_ino);
+        vol->vol_ino = NULL;
+        /* NTFS 3.0+ specific clean up. */
+        if (vol->major_ver >= 3) {
+#ifdef NTFS_RW
+                if (vol->quota_q_ino) {
+                        iput(vol->quota_q_ino);
+                        vol->quota_q_ino = NULL;
+                }
+                if (vol->quota_ino) {
+                        iput(vol->quota_ino);
+                        vol->quota_ino = NULL;
+                }
+#endif /* NTFS_RW */
+                if (vol->extend_ino) {
+                        iput(vol->extend_ino);
+                        vol->extend_ino = NULL;
+                }
+                if (vol->secure_ino) {
+                        iput(vol->secure_ino);
+                        vol->secure_ino = NULL;
+                }
+        }
+        iput(vol->root_ino);
+        vol->root_ino = NULL;
+        iput(vol->lcnbmp_ino);
+        vol->lcnbmp_ino = NULL;
+        iput(vol->mftbmp_ino);
+        vol->mftbmp_ino = NULL;
+#ifdef NTFS_RW
+        if (vol->logfile_ino) {
+                iput(vol->logfile_ino);
+                vol->logfile_ino = NULL;
+        }
+        if (vol->mftmirr_ino) {
+                iput(vol->mftmirr_ino);
+                vol->mftmirr_ino = NULL;
+        }
+#endif /* NTFS_RW */
+        /* Throw away the table of attribute definitions. */
+        vol->attrdef_size = 0;
+        if (vol->attrdef) {
+                ntfs_free(vol->attrdef);
+                vol->attrdef = NULL;
+        }
+        vol->upcase_len = 0;
+        /*
+         * If the volume uses the shared default upcase table just drop our
+         * reference to it, otherwise free the volume's private table.
+         */
+        down(&ntfs_lock);
+        if (vol->upcase == default_upcase) {
+                ntfs_nr_upcase_users--;
+                vol->upcase = NULL;
+        }
+        up(&ntfs_lock);
+        if (vol->upcase) {
+                ntfs_free(vol->upcase);
+                vol->upcase = NULL;
+        }
+        if (vol->nls_map) {
+                unload_nls(vol->nls_map);
+                vol->nls_map = NULL;
+        }
+        /* Error exit code path. */
+unl_upcase_iput_tmp_ino_err_out_now:
+        /*
+         * Decrease the number of upcase users and destroy the global default
+         * upcase table if necessary.  Likewise drop our compression user
+         * count and free the compression buffers if we were the last user.
+         */
+        down(&ntfs_lock);
+        if (!--ntfs_nr_upcase_users && default_upcase) {
+                ntfs_free(default_upcase);
+                default_upcase = NULL;
+        }
+        if (vol->cluster_size <= 4096 && !--ntfs_nr_compression_users)
+                free_compression_buffers();
+        up(&ntfs_lock);
+iput_tmp_ino_err_out_now:
+        iput(tmp_ino);
+        if (vol->mft_ino && vol->mft_ino != tmp_ino)
+                iput(vol->mft_ino);
+        vol->mft_ino = NULL;
+        /*
+         * This is needed to get ntfs_clear_extent_inode() called for each
+         * inode we have ever called ntfs_iget()/iput() on, otherwise we A)
+         * leak resources and B) a subsequent mount fails automatically due to
+         * ntfs_iget() never calling down into our ntfs_read_locked_inode()
+         * method again... FIXME: Do we need to do this twice now because of
+         * attribute inodes? I think not, so leave as is for now... (AIA)
+         */
+        if (invalidate_inodes(sb)) {
+                ntfs_error(sb, "Busy inodes left. This is most likely a NTFS "
+                                "driver bug.");
+                /* Copied from fs/super.c. I just love this message. (-; */
+                printk("NTFS: Busy inodes after umount. Self-destruct in 5 "
+                                "seconds.  Have a nice day...\n");
+        }
+        /* Errors at this stage are irrelevant. */
+err_out_now:
+        /* Retake the big kernel lock and free the volume structure. */
+        lock_kernel();
+        sb->s_fs_info = NULL;
+        kfree(vol);
+        ntfs_debug("Failed, returning -EINVAL.");
+        return -EINVAL;
+}
+/*
+ * This is a slab cache to optimize allocations and deallocations of Unicode
+ * strings of the maximum length allowed by NTFS, which is NTFS_MAX_NAME_LEN
+ * (255) Unicode characters + a terminating NULL Unicode character.  It is
+ * created in init_ntfs_fs() and destroyed in exit_ntfs_fs().
+ */
+kmem_cache_t *ntfs_name_cache;
+/*
+ * Slab caches for efficient allocation/deallocation of inodes.  Both are
+ * created in init_ntfs_fs() and destroyed in exit_ntfs_fs().
+ */
+kmem_cache_t *ntfs_inode_cache;
+kmem_cache_t *ntfs_big_inode_cache;
+/*
+ * Init once constructor for the big inode slab cache.
+ *
+ * Runs inode_init_once() on the VFS inode of the ntfs_inode @foo, but only
+ * when @flags has SLAB_CTOR_CONSTRUCTOR set and SLAB_CTOR_VERIFY clear.
+ * @cachep is unused.
+ */
+static void ntfs_big_inode_init_once(void *foo, kmem_cache_t *cachep,
+                unsigned long flags)
+{
+        const unsigned long ctor_bits = flags &
+                        (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR);
+        /* Anything other than a plain constructor call is ignored. */
+        if (ctor_bits != SLAB_CTOR_CONSTRUCTOR)
+                return;
+        inode_init_once(VFS_I((ntfs_inode *)foo));
+}
+/*
+ * Slab caches to optimize allocations and deallocations of attribute search
+ * contexts and index contexts, respectively.  Both are created in
+ * init_ntfs_fs() and destroyed in exit_ntfs_fs().
+ */
+kmem_cache_t *ntfs_attr_ctx_cache;
+kmem_cache_t *ntfs_index_ctx_cache;
+/*
+ * Driver wide semaphore.  ntfs_fill_super() holds it while updating
+ * ntfs_nr_compression_users, ntfs_nr_upcase_users and default_upcase.
+ */
+DECLARE_MUTEX(ntfs_lock);
+/*
+ * Get-superblock method of the ntfs file system type.  Hands all arguments to
+ * get_sb_bdev() together with ntfs_fill_super() as the fill callback and
+ * returns whatever get_sb_bdev() returns.
+ */
+static struct super_block *ntfs_get_sb(struct file_system_type *fs_type,
+        int flags, const char *dev_name, void *data)
+{
+        struct super_block *sb;
+        sb = get_sb_bdev(fs_type, flags, dev_name, data, ntfs_fill_super);
+        return sb;
+}
+/*
+ * The ntfs file system type.  Registered with the VFS in init_ntfs_fs() and
+ * unregistered in exit_ntfs_fs().
+ */
+static struct file_system_type ntfs_fs_type = {
+        .owner          = THIS_MODULE,
+        .name           = "ntfs",
+        .get_sb         = ntfs_get_sb,
+        .kill_sb        = kill_block_super,
+        .fs_flags       = FS_REQUIRES_DEV,
+};
+/*
+ * Stable names for the slab caches.  They are passed to kmem_cache_create()
+ * in init_ntfs_fs() and printed in the error messages of init_ntfs_fs() and
+ * exit_ntfs_fs().
+ */
+static const char ntfs_index_ctx_cache_name[] = "ntfs_index_ctx_cache";
+static const char ntfs_attr_ctx_cache_name[] = "ntfs_attr_ctx_cache";
+static const char ntfs_name_cache_name[] = "ntfs_name_cache";
+static const char ntfs_inode_cache_name[] = "ntfs_inode_cache";
+static const char ntfs_big_inode_cache_name[] = "ntfs_big_inode_cache";
+/**
+ * init_ntfs_fs - initialize the NTFS driver
+ *
+ * Print the driver banner, create the five slab caches used by the driver,
+ * register the ntfs sysctls and finally register the ntfs file system type.
+ *
+ * If any step fails, everything set up by the earlier steps is undone again
+ * in reverse order.  In particular, if register_filesystem() fails, the
+ * sysctls registered just before it are unregistered so that no sysctl entry
+ * is left behind referring to a driver that failed to load.
+ *
+ * Return 0 on success, -ENOMEM if a slab cache could not be created, or the
+ * error returned by ntfs_sysctl() or register_filesystem().
+ */
+static int __init init_ntfs_fs(void)
+{
+        int err = 0;
+        /* This may be ugly but it results in pretty output so who cares. (-8 */
+        printk(KERN_INFO "NTFS driver " NTFS_VERSION " [Flags: R/"
+#ifdef NTFS_RW
+                        "W"
+#else
+                        "O"
+#endif
+#ifdef DEBUG
+                        " DEBUG"
+#endif
+#ifdef MODULE
+                        " MODULE"
+#endif
+                        "].\n");
+        ntfs_debug("Debug messages are enabled.");
+        ntfs_index_ctx_cache = kmem_cache_create(ntfs_index_ctx_cache_name,
+                        sizeof(ntfs_index_context), 0 /* offset */,
+                        SLAB_HWCACHE_ALIGN, NULL /* ctor */, NULL /* dtor */);
+        if (!ntfs_index_ctx_cache) {
+                printk(KERN_CRIT "NTFS: Failed to create %s!\n",
+                                ntfs_index_ctx_cache_name);
+                goto ictx_err_out;
+        }
+        ntfs_attr_ctx_cache = kmem_cache_create(ntfs_attr_ctx_cache_name,
+                        sizeof(ntfs_attr_search_ctx), 0 /* offset */,
+                        SLAB_HWCACHE_ALIGN, NULL /* ctor */, NULL /* dtor */);
+        if (!ntfs_attr_ctx_cache) {
+                printk(KERN_CRIT "NTFS: Failed to create %s!\n",
+                                ntfs_attr_ctx_cache_name);
+                goto actx_err_out;
+        }
+        ntfs_name_cache = kmem_cache_create(ntfs_name_cache_name,
+                        (NTFS_MAX_NAME_LEN+1) * sizeof(ntfschar), 0,
+                        SLAB_HWCACHE_ALIGN, NULL, NULL);
+        if (!ntfs_name_cache) {
+                printk(KERN_CRIT "NTFS: Failed to create %s!\n",
+                                ntfs_name_cache_name);
+                goto name_err_out;
+        }
+        ntfs_inode_cache = kmem_cache_create(ntfs_inode_cache_name,
+                        sizeof(ntfs_inode), 0,
+                        SLAB_RECLAIM_ACCOUNT, NULL, NULL);
+        if (!ntfs_inode_cache) {
+                printk(KERN_CRIT "NTFS: Failed to create %s!\n",
+                                ntfs_inode_cache_name);
+                goto inode_err_out;
+        }
+        /* Only the big inode cache has a constructor. */
+        ntfs_big_inode_cache = kmem_cache_create(ntfs_big_inode_cache_name,
+                        sizeof(big_ntfs_inode), 0,
+                        SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT,
+                        ntfs_big_inode_init_once, NULL);
+        if (!ntfs_big_inode_cache) {
+                printk(KERN_CRIT "NTFS: Failed to create %s!\n",
+                                ntfs_big_inode_cache_name);
+                goto big_inode_err_out;
+        }
+        /* Register the ntfs sysctls. */
+        err = ntfs_sysctl(1);
+        if (err) {
+                printk(KERN_CRIT "NTFS: Failed to register NTFS sysctls!\n");
+                goto sysctl_err_out;
+        }
+        err = register_filesystem(&ntfs_fs_type);
+        if (!err) {
+                ntfs_debug("NTFS driver registered successfully.");
+                return 0; /* Success! */
+        }
+        printk(KERN_CRIT "NTFS: Failed to register NTFS file system driver!\n");
+        /*
+         * The sysctls were registered successfully above, so they have to be
+         * unregistered again before the slab caches are torn down.
+         */
+        ntfs_sysctl(0);
+        /* Each label below undoes the steps that succeeded before it. */
+sysctl_err_out:
+        kmem_cache_destroy(ntfs_big_inode_cache);
+big_inode_err_out:
+        kmem_cache_destroy(ntfs_inode_cache);
+inode_err_out:
+        kmem_cache_destroy(ntfs_name_cache);
+name_err_out:
+        kmem_cache_destroy(ntfs_attr_ctx_cache);
+actx_err_out:
+        kmem_cache_destroy(ntfs_index_ctx_cache);
+ictx_err_out:
+        /*
+         * @err is still zero only if a slab cache creation failed, as the
+         * sysctl and file system registration failures set it themselves.
+         */
+        if (!err) {
+                printk(KERN_CRIT "NTFS: Aborting NTFS file system driver "
+                                "registration...\n");
+                err = -ENOMEM;
+        }
+        return err;
+}
+/**
+ * exit_ntfs_fs - tear down the NTFS driver
+ *
+ * Unregister the ntfs file system type, destroy the slab caches created by
+ * init_ntfs_fs() and finally unregister the ntfs sysctls.  A cache that
+ * cannot be destroyed is reported and, if any failed, a request to report
+ * the leak is printed as well.
+ */
+static void __exit exit_ntfs_fs(void)
+{
+        int leaked = 0;
+        ntfs_debug("Unregistering NTFS driver.");
+        unregister_filesystem(&ntfs_fs_type);
+        /* Non-zero from kmem_cache_destroy() is treated as a failure. */
+        if (kmem_cache_destroy(ntfs_big_inode_cache)) {
+                leaked = 1;
+                printk(KERN_CRIT "NTFS: Failed to destory %s.\n",
+                                ntfs_big_inode_cache_name);
+        }
+        if (kmem_cache_destroy(ntfs_inode_cache)) {
+                leaked = 1;
+                printk(KERN_CRIT "NTFS: Failed to destory %s.\n",
+                                ntfs_inode_cache_name);
+        }
+        if (kmem_cache_destroy(ntfs_name_cache)) {
+                leaked = 1;
+                printk(KERN_CRIT "NTFS: Failed to destory %s.\n",
+                                ntfs_name_cache_name);
+        }
+        if (kmem_cache_destroy(ntfs_attr_ctx_cache)) {
+                leaked = 1;
+                printk(KERN_CRIT "NTFS: Failed to destory %s.\n",
+                                ntfs_attr_ctx_cache_name);
+        }
+        if (kmem_cache_destroy(ntfs_index_ctx_cache)) {
+                leaked = 1;
+                printk(KERN_CRIT "NTFS: Failed to destory %s.\n",
+                                ntfs_index_ctx_cache_name);
+        }
+        if (leaked)
+                printk(KERN_CRIT "NTFS: This causes memory to leak! There is "
+                                "probably a BUG in the driver! Please report "
+                                "you saw this message to "
+                                "linux-ntfs-dev@lists.sourceforge.net\n");
+        /* Unregister the ntfs sysctls. */
+        ntfs_sysctl(0);
+}
+/* Module metadata. */
+MODULE_AUTHOR("Anton Altaparmakov <aia21@cantab.net>");
+MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2004 Anton Altaparmakov");
+MODULE_VERSION(NTFS_VERSION);
+MODULE_LICENSE("GPL");
+#ifdef DEBUG
+/* The debug_msgs switch is only a module parameter in DEBUG builds. */
+module_param(debug_msgs, bool, 0);
+MODULE_PARM_DESC(debug_msgs, "Enable debug messages.");
+#endif
+/* Driver entry and exit points. */
+module_init(init_ntfs_fs)
+module_exit(exit_ntfs_fs)
diff --git a/fs/ntfs/sysctl.c b/fs/ntfs/sysctl.c
new file mode 100644
index 000000000000..75067e4f3036
--- /dev/null
+++ b/fs/ntfs/sysctl.c
@@ -0,0 +1,85 @@
+/*
+ * sysctl.c - Code for sysctl handling in NTFS Linux kernel driver. Part of
+ *            the Linux-NTFS project. Adapted from the old NTFS driver,
+ *            Copyright (C) 1997 Martin von Löwis, Régis Duchesne
+ *
+ * Copyright (c) 2002-2004 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#ifdef DEBUG
+#include <linux/module.h>
+#ifdef CONFIG_SYSCTL
+#include <linux/proc_fs.h>
+#include <linux/sysctl.h>
+#include "sysctl.h"
+#include "debug.h"
+#define FS_NTFS 1
+/*
+ * Definition of the ntfs sysctl: a single "ntfs-debug" entry backed by the
+ * debug_msgs variable and handled by proc_dointvec, followed by the
+ * terminating empty entry.
+ */
+static ctl_table ntfs_sysctls[] = {
+        {
+                .ctl_name       = FS_NTFS,              /* Binary ID. */
+                .procname       = "ntfs-debug",         /* Text ID. */
+                .data           = &debug_msgs,          /* Data pointer. */
+                .maxlen         = sizeof(debug_msgs),   /* Data size. */
+                .mode           = 0644,                 /* Mode. */
+                .child          = NULL,                 /* No child table. */
+                .proc_handler   = &proc_dointvec,       /* Proc handler. */
+        },
+        { .ctl_name = 0 }
+};
+/*
+ * Define the parent directory /proc/sys/fs.  It carries no data of its own
+ * and only serves to hang ntfs_sysctls off it as its child table.
+ */
+static ctl_table sysctls_root[] = {
+        {
+                .ctl_name       = CTL_FS,
+                .procname       = "fs",
+                .data           = NULL,
+                .maxlen         = 0,
+                .mode           = 0555,
+                .child          = ntfs_sysctls,
+        },
+        { .ctl_name = 0 }
+};
+/*
+ * Storage for the sysctls header.  Non-NULL exactly while the sysctls are
+ * registered by ntfs_sysctl().
+ */
+static struct ctl_table_header *sysctls_root_table = NULL;
+/**
+ * ntfs_sysctl - register or unregister the ntfs debug sysctl
+ * @add:        non-zero to register the sysctl, zero to unregister it
+ *
+ * Registering when the sysctl is already registered, or unregistering when
+ * it is not, is a bug and triggers BUG_ON().
+ *
+ * Return 0 on success or -ENOMEM if registering the sysctl table failed.
+ */
+int ntfs_sysctl(int add)
+{
+        if (!add) {
+                /* Removal: the table must have been registered before. */
+                BUG_ON(!sysctls_root_table);
+                unregister_sysctl_table(sysctls_root_table);
+                sysctls_root_table = NULL;
+                return 0;
+        }
+        /* Registration: the table must not be registered already. */
+        BUG_ON(sysctls_root_table);
+        sysctls_root_table = register_sysctl_table(sysctls_root, 0);
+        if (!sysctls_root_table)
+                return -ENOMEM;
+#ifdef CONFIG_PROC_FS
+        /*
+         * If the proc file system is in use and we are a module, the owner
+         * of our proc entry has to be set to our module.  In the non-modular
+         * case, THIS_MODULE is NULL, so this is ok.
+         * NOTE(review): assumes register_sysctl_table() has filled in ->de
+         * for our entry - confirm.
+         */
+        ntfs_sysctls[0].de->owner = THIS_MODULE;
+#endif
+        return 0;
+}
+#endif /* CONFIG_SYSCTL */
+#endif /* DEBUG */
diff --git a/fs/ntfs/sysctl.h b/fs/ntfs/sysctl.h
new file mode 100644
index 000000000000..df749cc0aac8
--- /dev/null
+++ b/fs/ntfs/sysctl.h
@@ -0,0 +1,42 @@
+/*
+ * sysctl.h - Defines for sysctl handling in NTFS Linux kernel driver. Part of
+ *            the Linux-NTFS project. Adapted from the old NTFS driver,
+ *            Copyright (C) 1997 Martin von Löwis, Régis Duchesne
+ *
+ * Copyright (c) 2002-2004 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#ifndef _LINUX_NTFS_SYSCTL_H
+#define _LINUX_NTFS_SYSCTL_H
+#include <linux/config.h>
+/*
+ * The real ntfs_sysctl() is only built when both DEBUG and CONFIG_SYSCTL are
+ * defined (sysctl.c guards its definition with #ifdef on each of them), so
+ * test for definedness here as well.  Testing the macro values instead would
+ * disagree with sysctl.c whenever one of them is defined empty or as 0.
+ */
+#if defined(DEBUG) && defined(CONFIG_SYSCTL)
+extern int ntfs_sysctl(int add);
+#else
+/* No sysctl support compiled in: nothing to add or remove, report success. */
+static inline int ntfs_sysctl(int add)
+{
+        return 0;
+}
+#endif /* DEBUG && CONFIG_SYSCTL */
+#endif /* _LINUX_NTFS_SYSCTL_H */
diff --git a/fs/ntfs/time.h b/fs/ntfs/time.h
new file mode 100644
index 000000000000..a09a51dabe4e
--- /dev/null
+++ b/fs/ntfs/time.h
@@ -0,0 +1,100 @@
+/*
+ * time.h - NTFS time conversion functions.  Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2004 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#ifndef _LINUX_NTFS_TIME_H
+#define _LINUX_NTFS_TIME_H
+#include <linux/time.h>         /* For current_kernel_time(). */
+#include <asm/div64.h>          /* For do_div(). */
+#include "endian.h"
+/*
+ * Number of 100ns intervals between the NTFS epoch (1st January 1601) and the
+ * Linux epoch (1st January 1970): 369 years of 365 days plus 89 leap days,
+ * times 24 * 3600 seconds per day, times 10000000 100ns intervals per second.
+ */
+#define NTFS_TIME_OFFSET ((s64)(369 * 365 + 89) * 24 * 3600 * 10000000)
+/**
+ * utc2ntfs - convert Linux UTC time to NTFS time
+ * @ts:         Linux UTC time to convert to NTFS time
+ *
+ * Convert the Linux UTC time @ts to the corresponding NTFS time and return it
+ * in little endian format.
+ *
+ * Linux keeps time in a struct timespec: tv_sec counts the 1-second intervals
+ * since 1st January 1970, 00:00:00 UTC and tv_nsec counts the 1-nano-second
+ * intervals since the start of that second.
+ *
+ * NTFS uses Microsoft's standard time format, a s64 counting the
+ * 100-nano-second intervals since 1st January 1601, 00:00:00 UTC.
+ */
+static inline sle64 utc2ntfs(const struct timespec ts)
+{
+        s64 ntfs_time;
+        /* Whole seconds expressed in 100ns intervals. */
+        ntfs_time = (s64)ts.tv_sec * 10000000;
+        /* Add the sub-second part, truncated to 100ns resolution. */
+        ntfs_time += ts.tv_nsec / 100;
+        /* Move the epoch from 1970 back to 1601. */
+        ntfs_time += NTFS_TIME_OFFSET;
+        return cpu_to_sle64(ntfs_time);
+}
+/**
+ * get_current_ntfs_time - get the current time in little endian NTFS format
+ *
+ * Read the current time with current_kernel_time() and return it converted by
+ * utc2ntfs(), i.e. as a little endian NTFS time.
+ */
+static inline sle64 get_current_ntfs_time(void)
+{
+        const struct timespec now = current_kernel_time();
+        return utc2ntfs(now);
+}
+/**
+ * ntfs2utc - convert NTFS time to Linux time
+ * @time:       NTFS time (little endian) to convert to Linux UTC
+ *
+ * Convert the little endian NTFS time @time to its corresponding Linux UTC
+ * time and return that in cpu format.
+ *
+ * Linux stores time in a struct timespec consisting of a time_t (long at
+ * present) tv_sec and a long tv_nsec where tv_sec is the number of 1-second
+ * intervals since 1st January 1970, 00:00:00 UTC and tv_nsec is the number of
+ * 1-nano-second intervals since the value of tv_sec.
+ *
+ * NTFS uses Microsoft's standard time format which is stored in a s64 and is
+ * measured as the number of 100 nano-second intervals since 1st January 1601,
+ * 00:00:00 UTC.
+ *
+ * NOTE(review): an NTFS time earlier than 1st January 1970 gives a negative
+ * difference below; it is not treated specially and is divided as the
+ * corresponding unsigned 64-bit value.
+ */
+static inline struct timespec ntfs2utc(const sle64 time)
+{
+        struct timespec ts;
+        /*
+         * Subtract the NTFS time offset.  The result is held in a u64 because
+         * do_div() operates on an unsigned 64-bit dividend.
+         */
+        u64 t = (u64)(sle64_to_cpu(time) - NTFS_TIME_OFFSET);
+        /*
+         * Convert the time to 1-second intervals and the remainder to
+         * 1-nano-second intervals.  do_div() leaves the quotient in @t and
+         * returns the remainder.
+         */
+        ts.tv_nsec = do_div(t, 10000000) * 100;
+        ts.tv_sec = t;
+        return ts;
+}
+#endif /* _LINUX_NTFS_TIME_H */
diff --git a/fs/ntfs/types.h b/fs/ntfs/types.h
new file mode 100644
index 000000000000..08a55aa53d4e
--- /dev/null
+++ b/fs/ntfs/types.h
@@ -0,0 +1,66 @@
+/*
+ * types.h - Defines for NTFS Linux kernel driver specific types.
+ *           Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2004 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#ifndef _LINUX_NTFS_TYPES_H
+#define _LINUX_NTFS_TYPES_H
+#include <linux/types.h>
+/* Short names for the __le16/__le32/__le64 little endian types. */
+typedef __le16 le16;
+typedef __le32 le32;
+typedef __le64 le64;
+/*
+ * Little endian types for values that are signed, such as the leVCN, leLCN
+ * and leLSN types defined further down in this header.
+ * NOTE(review): __bitwise presumably makes these distinct types for sparse so
+ * that mixing them with cpu endian integers is flagged - confirm.
+ */
+typedef __u16 __bitwise sle16;
+typedef __u32 __bitwise sle32;
+typedef __u64 __bitwise sle64;
+/* 2-byte Unicode character type, stored as a little endian 16-bit value. */
+typedef le16 ntfschar;
+/*
+ * NOTE(review): presumably log2(sizeof(ntfschar)), i.e. the shift converting
+ * a length in Unicode characters to a length in bytes - confirm with users.
+ */
+#define UCHAR_T_SIZE_BITS 1
+/*
+ * Clusters are signed 64-bit values on NTFS volumes. We define two types, LCN
+ * and VCN, to allow for type checking and better code readability.
+ *
+ * VCN and LCN are plain s64 values; leVCN and leLCN are the same quantities
+ * held in the little endian sle64 type.
+ */
+typedef s64 VCN;
+typedef sle64 leVCN;
+typedef s64 LCN;
+typedef sle64 leLCN;
+/*
+ * The NTFS journal $LogFile uses log sequence numbers which are signed 64-bit
+ * values.  We define our own type LSN, to allow for type checking and better
+ * code readability.
+ *
+ * LSN is a plain s64; leLSN is the same quantity held in the little endian
+ * sle64 type.
+ */
+typedef s64 LSN;
+typedef sle64 leLSN;
+/* Boolean type with explicit FALSE (0) and TRUE (1) values. */
+typedef enum {
+        FALSE = 0,
+        TRUE = 1
+} BOOL;
+/*
+ * Selects between case sensitive and case insensitive name handling.  Passed
+ * as the @ic argument of the name comparison functions in unistr.c.
+ */
+typedef enum {
+        CASE_SENSITIVE = 0,
+        IGNORE_CASE = 1,
+} IGNORE_CASE_BOOL;
+#endif /* _LINUX_NTFS_TYPES_H */
diff --git a/fs/ntfs/unistr.c b/fs/ntfs/unistr.c
new file mode 100644
index 000000000000..560b0ea255b0
--- /dev/null
+++ b/fs/ntfs/unistr.c
@@ -0,0 +1,384 @@
+/*
+ * unistr.c - NTFS Unicode string handling. Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2004 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#include "types.h"
+#include "debug.h"
+#include "ntfs.h"
+/*
+ * IMPORTANT
+ * =========
+ *
+ * All these routines assume that the Unicode characters are in little endian
+ * encoding inside the strings!!!
+ */
+/*
+ * This is used by the name collation functions to quickly determine what
+ * characters are (in)valid.
+ *
+ * The table is indexed by character code and covers the codes 0x00 to 0x3f
+ * only.  ntfs_collate_names() rejects a character when bit 3 (0x08) of its
+ * entry is set; the entries with that bit set are 0x22 '"', 0x2a '*',
+ * 0x3c '<', 0x3e '>' and 0x3f '?'.  The remaining bits are not examined
+ * anywhere in this file.
+ */
+static const u8 legal_ansi_char_array[0x40] = {
+        0x00, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
+        0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
+        0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
+        0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
+        0x17, 0x07, 0x18, 0x17, 0x17, 0x17, 0x17, 0x17,
+        0x17, 0x17, 0x18, 0x16, 0x16, 0x17, 0x07, 0x00,
+        0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17,
+        0x17, 0x17, 0x04, 0x16, 0x18, 0x16, 0x18, 0x18,
+};
+/**
+ * ntfs_are_names_equal - compare two Unicode names for equality
+ * @s1:                 name to compare to @s2
+ * @s1_len:             length in Unicode characters of @s1
+ * @s2:                 name to compare to @s1
+ * @s2_len:             length in Unicode characters of @s2
+ * @ic:                 ignore case bool
+ * @upcase:             upcase table (only if @ic == IGNORE_CASE)
+ * @upcase_size:        length in Unicode characters of @upcase (if present)
+ *
+ * Return TRUE (1) if the names @s1 and @s2 are identical and FALSE (0) if
+ * they are not.  Names of different length are never equal.  If @ic is
+ * CASE_SENSITIVE the characters are compared as they are, otherwise the
+ * @upcase table is used to perform a case insensitive comparison.
+ */
+BOOL ntfs_are_names_equal(const ntfschar *s1, size_t s1_len,
+                const ntfschar *s2, size_t s2_len, const IGNORE_CASE_BOOL ic,
+                const ntfschar *upcase, const u32 upcase_size)
+{
+        int diff;
+        /* Names of different length cannot match. */
+        if (s1_len != s2_len)
+                return FALSE;
+        if (ic == CASE_SENSITIVE)
+                diff = ntfs_ucsncmp(s1, s2, s1_len);
+        else
+                diff = ntfs_ucsncasecmp(s1, s2, s1_len, upcase, upcase_size);
+        return !diff;
+}
+/**
+ * ntfs_collate_names - collate two Unicode names
+ * @name1:      first Unicode name to compare
+ * @name1_len:  length in Unicode characters of @name1
+ * @name2:      second Unicode name to compare
+ * @name2_len:  length in Unicode characters of @name2
+ * @err_val:    if @name1 contains an invalid character return this value
+ * @ic:         either CASE_SENSITIVE or IGNORE_CASE
+ * @upcase:     upcase table (ignored if @ic is CASE_SENSITIVE)
+ * @upcase_len: upcase table size (ignored if @ic is CASE_SENSITIVE)
+ *
+ * ntfs_collate_names collates two Unicode names and returns:
+ *
+ *  -1 if the first name collates before the second one,
+ *   0 if the names match,
+ *   1 if the second name collates before the first one, or
+ * @err_val if an invalid character is found in @name1 during the comparison.
+ *
+ * The following characters are considered invalid: '"', '*', '<', '>' and '?'.
+ *
+ * Only the characters of @name1 that are actually looked at are validated:
+ * those up to and including the first difference and, if @name1 is the longer
+ * name, the first character past the end of @name2.
+ */
+int ntfs_collate_names(const ntfschar *name1, const u32 name1_len,
+                const ntfschar *name2, const u32 name2_len,
+                const int err_val, const IGNORE_CASE_BOOL ic,
+                const ntfschar *upcase, const u32 upcase_len)
+{
+        u32 i, common_len;
+        u16 ch1, ch2;
+        /* Only the part present in both names can be compared pairwise. */
+        common_len = (name1_len > name2_len) ? name2_len : name1_len;
+        for (i = 0; i < common_len; i++) {
+                ch1 = le16_to_cpu(name1[i]);
+                ch2 = le16_to_cpu(name2[i]);
+                if (ic) {
+                        /* Characters beyond the table are left unchanged. */
+                        if (ch1 < upcase_len)
+                                ch1 = le16_to_cpu(upcase[ch1]);
+                        if (ch2 < upcase_len)
+                                ch2 = le16_to_cpu(upcase[ch2]);
+                }
+                /* Bit 3 in the table marks an invalid character. */
+                if (ch1 < 64 && legal_ansi_char_array[ch1] & 8)
+                        return err_val;
+                if (ch1 != ch2)
+                        return (ch1 < ch2) ? -1 : 1;
+        }
+        /* The common part matches, so the lengths decide. */
+        if (name1_len < name2_len)
+                return -1;
+        if (name1_len == name2_len)
+                return 0;
+        /* @name1 is the longer name: validate its next character as is. */
+        ch1 = le16_to_cpu(name1[common_len]);
+        if (ch1 < 64 && legal_ansi_char_array[ch1] & 8)
+                return err_val;
+        return 1;
+}
+/**
+ * ntfs_ucsncmp - compare two little endian Unicode strings
+ * @s1:         first string
+ * @s2:         second string
+ * @n:          maximum unicode characters to compare
+ *
+ * Compare the first @n characters of the Unicode strings @s1 and @s2.
+ * The strings are in little endian format and each character is converted
+ * with le16_to_cpu() before it is compared.  The comparison also stops at the
+ * first position where both strings hold a zero character.
+ *
+ * The function returns -1, 0 or 1 if @s1 (or the first @n Unicode characters
+ * thereof) is found, respectively, to be less than, to match, or be greater
+ * than @s2.
+ */
+int ntfs_ucsncmp(const ntfschar *s1, const ntfschar *s2, size_t n)
+{
+        size_t pos;
+        for (pos = 0; pos < n; pos++) {
+                const u16 a = le16_to_cpu(s1[pos]);
+                const u16 b = le16_to_cpu(s2[pos]);
+                if (a != b)
+                        return (a < b) ? -1 : 1;
+                /* Equal and zero: both strings end here. */
+                if (a == 0)
+                        return 0;
+        }
+        return 0;
+}
+/**
+ * ntfs_ucsncasecmp - compare two little endian Unicode strings, ignoring case
+ * @s1:                 first string
+ * @s2:                 second string
+ * @n:                  maximum unicode characters to compare
+ * @upcase:             upcase table
+ * @upcase_size:        upcase table size in Unicode characters
+ *
+ * Compare the first @n characters of the Unicode strings @s1 and @s2,
+ * ignoring case.  The strings are in little endian format and each character
+ * is converted with le16_to_cpu() before use.
+ *
+ * Each character that has an entry in the @upcase table is replaced by that
+ * entry before the comparison; characters at or beyond @upcase_size are
+ * compared unchanged.  The comparison also stops at the first position where
+ * both (upcased) characters are zero.
+ *
+ * The function returns -1, 0 or 1 if @s1 (or the first @n Unicode characters
+ * thereof) is found, respectively, to be less than, to match, or be greater
+ * than @s2.
+ */
+int ntfs_ucsncasecmp(const ntfschar *s1, const ntfschar *s2, size_t n,
+                const ntfschar *upcase, const u32 upcase_size)
+{
+        size_t pos;
+        for (pos = 0; pos < n; pos++) {
+                u16 a = le16_to_cpu(s1[pos]);
+                u16 b = le16_to_cpu(s2[pos]);
+                if (a < upcase_size)
+                        a = le16_to_cpu(upcase[a]);
+                if (b < upcase_size)
+                        b = le16_to_cpu(upcase[b]);
+                if (a != b)
+                        return (a < b) ? -1 : 1;
+                /* Equal and zero: both strings end here. */
+                if (a == 0)
+                        return 0;
+        }
+        return 0;
+}
+/**
+ * ntfs_upcase_name - convert a little endian Unicode name to upper case
+ * @name:       name to convert, modified in place
+ * @name_len:   length in Unicode characters of @name
+ * @upcase:     upcase table
+ * @upcase_len: length in Unicode characters of @upcase
+ *
+ * Replace every character of @name that has an entry in the @upcase table by
+ * that entry.  Characters at or beyond @upcase_len are left unchanged.
+ */
+void ntfs_upcase_name(ntfschar *name, u32 name_len, const ntfschar *upcase,
+                const u32 upcase_len)
+{
+        u32 pos;
+        for (pos = 0; pos < name_len; pos++) {
+                const u16 ch = le16_to_cpu(name[pos]);
+                /* No table entry for this character, keep it as it is. */
+                if (ch >= upcase_len)
+                        continue;
+                /* Table entries are already little endian. */
+                name[pos] = upcase[ch];
+        }
+}
+/**
+ * ntfs_file_upcase_value - upcase the name stored in a file name attribute
+ * @file_name_attr:     file name attribute whose name is converted in place
+ * @upcase:             upcase table
+ * @upcase_len:         length in Unicode characters of @upcase
+ *
+ * Apply ntfs_upcase_name() to the file_name field of @file_name_attr, using
+ * its file_name_length field as the length.
+ */
+void ntfs_file_upcase_value(FILE_NAME_ATTR *file_name_attr,
+                const ntfschar *upcase, const u32 upcase_len)
+{
+        ntfschar *name = (ntfschar*)&file_name_attr->file_name;
+        ntfs_upcase_name(name, file_name_attr->file_name_length, upcase,
+                        upcase_len);
+}
+/**
+ * ntfs_file_compare_values - collate the names of two file name attributes
+ * @file_name_attr1:    first file name attribute
+ * @file_name_attr2:    second file name attribute
+ * @err_val:            value to return if the first name has an invalid character
+ * @ic:                 either CASE_SENSITIVE or IGNORE_CASE
+ * @upcase:             upcase table (ignored if @ic is CASE_SENSITIVE)
+ * @upcase_len:         upcase table size (ignored if @ic is CASE_SENSITIVE)
+ *
+ * Pass the file_name and file_name_length fields of both attributes to
+ * ntfs_collate_names() and return its result.
+ */
+int ntfs_file_compare_values(FILE_NAME_ATTR *file_name_attr1,
+                FILE_NAME_ATTR *file_name_attr2,
+                const int err_val, const IGNORE_CASE_BOOL ic,
+                const ntfschar *upcase, const u32 upcase_len)
+{
+        const ntfschar *first = (ntfschar*)&file_name_attr1->file_name;
+        const ntfschar *second = (ntfschar*)&file_name_attr2->file_name;
+        return ntfs_collate_names(first, file_name_attr1->file_name_length,
+                        second, file_name_attr2->file_name_length,
+                        err_val, ic, upcase, upcase_len);
+}
+/**
+ * ntfs_nlstoucs - convert NLS string to little endian Unicode string
+ * @vol:        ntfs volume which we are working with
+ * @ins:        input NLS string buffer
+ * @ins_len:    length of input string in bytes
+ * @outs:       on return contains the allocated output Unicode string buffer
+ *
+ * Convert the input string @ins, which is in whatever format the loaded NLS
+ * map dictates, into a little endian, 2-byte Unicode string.  Conversion
+ * stops at the end of the input or at the first character that converts to
+ * Unicode NULL, whichever comes first.
+ *
+ * This function allocates the string and the caller is responsible for
+ * calling kmem_cache_free(ntfs_name_cache, @outs); when finished with it.
+ *
+ * On success the function returns the number of Unicode characters written to
+ * the output string *@outs (>= 0), not counting the terminating Unicode NULL
+ * character. *@outs is set to the allocated output string buffer.
+ *
+ * On error, a negative number corresponding to the error code is returned. In
+ * that case the output string is not allocated and *@outs is undefined.  The
+ * error codes are:
+ *      -EINVAL         @ins is NULL,
+ *      -ENOMEM         the output string could not be allocated,
+ *      -EILSEQ         @ins contains a character that cannot be converted,
+ *      -ENAMETOOLONG   the converted name exceeds NTFS_MAX_NAME_LEN characters.
+ */
+int ntfs_nlstoucs(const ntfs_volume *vol, const char *ins,
+                const int ins_len, ntfschar **outs)
+{
+        struct nls_table *nls = vol->nls_map;
+        ntfschar *ucs;
+        wchar_t wc;
+        int i, o, wc_len;
+        /* We don't trust outside sources. */
+        if (!ins) {
+                ntfs_error(NULL, "Received NULL pointer.");
+                return -EINVAL;
+        }
+        ucs = (ntfschar*)kmem_cache_alloc(ntfs_name_cache, SLAB_NOFS);
+        if (!ucs) {
+                ntfs_error(vol->sb, "Failed to allocate name from "
+                                "ntfs_name_cache!");
+                return -ENOMEM;
+        }
+        for (i = o = 0; i < ins_len; i += wc_len) {
+                wc_len = nls->char2uni(ins + i, ins_len - i, &wc);
+                if (wc_len < 0)
+                        goto name_err;
+                /* A character converting to Unicode NULL ends the name. */
+                if (!wc)
+                        break;
+                /*
+                 * Never write past the end of the name buffer.  This assumes
+                 * that objects from ntfs_name_cache hold NTFS_MAX_NAME_LEN
+                 * characters plus the terminator - TODO confirm against the
+                 * place where the cache is created.
+                 */
+                if (o >= NTFS_MAX_NAME_LEN)
+                        goto name_err;
+                ucs[o++] = cpu_to_le16(wc);
+        }
+        ucs[o] = 0;
+        *outs = ucs;
+        return o;
+name_err:
+        kmem_cache_free(ntfs_name_cache, ucs);
+        if (wc_len < 0) {
+                ntfs_error(vol->sb, "Name using character set %s contains characters "
+                                "that cannot be converted to Unicode.", nls->charset);
+                return -EILSEQ;
+        }
+        /* The conversion itself worked, so the name was too long. */
+        ntfs_error(vol->sb, "Name is too long (maximum length for a name on "
+                        "NTFS is %d Unicode characters).", NTFS_MAX_NAME_LEN);
+        return -ENAMETOOLONG;
+}
+/**
+ * ntfs_ucstonls - convert little endian Unicode string to NLS string
+ * @vol:        ntfs volume which we are working with
+ * @ins:        input Unicode string buffer
+ * @ins_len:    length of input string in Unicode characters
+ * @outs:       on return contains the (allocated) output NLS string buffer
+ * @outs_len:   size in bytes of the output string buffer, including the room
+ *              needed for the terminating NULL byte
+ *
+ * Convert the input little endian, 2-byte Unicode string @ins, of length
+ * @ins_len into the string format dictated by the loaded NLS.
+ *
+ * If *@outs is NULL, this function allocates the string and the caller is
+ * responsible for calling kfree(*@outs); when finished with it. In this case
+ * @outs_len is ignored and can be 0.
+ *
+ * If *@outs is not NULL, at most @outs_len - 1 bytes of converted text are
+ * written to it so that the terminating NULL byte always fits inside the
+ * buffer.
+ *
+ * On success the function returns the number of bytes written to the output
+ * string *@outs (>= 0), not counting the terminating NULL byte. If the output
+ * string buffer was allocated, *@outs is set to it.
+ *
+ * On error, a negative number corresponding to the error code is returned. In
+ * that case the output string is not allocated. The contents of *@outs are
+ * then undefined.  The error codes are -EINVAL (@ins is NULL), -ENOMEM (the
+ * output string could not be allocated), -ENAMETOOLONG (the output buffer is
+ * too small) and -EILSEQ (any other conversion failure).
+ *
+ * This might look a bit odd due to fast path optimization...
+ */
+int ntfs_ucstonls(const ntfs_volume *vol, const ntfschar *ins,
+                const int ins_len, unsigned char **outs, int outs_len)
+{
+        struct nls_table *nls = vol->nls_map;
+        unsigned char *ns;
+        int i, o, ns_len, wc;
+        /* We don't trust outside sources. */
+        if (ins) {
+                ns = *outs;
+                ns_len = outs_len;
+                if (ns && !ns_len) {
+                        wc = -ENAMETOOLONG;
+                        goto conversion_err;
+                }
+                if (!ns) {
+                        /* One extra byte is allocated for the terminator. */
+                        ns_len = ins_len * NLS_MAX_CHARSET_SIZE;
+                        ns = (unsigned char*)kmalloc(ns_len + 1, GFP_NOFS);
+                        if (!ns)
+                                goto mem_err_out;
+                } else {
+                        /*
+                         * Caller supplied buffer: keep its last byte free for
+                         * the terminating NULL byte, so that the write of
+                         * ns[o] below can never land beyond the buffer.
+                         */
+                        ns_len--;
+                }
+                for (i = o = 0; i < ins_len; i++) {
+retry:                  wc = nls->uni2char(le16_to_cpu(ins[i]), ns + o,
+                                        ns_len - o);
+                        if (wc > 0) {
+                                o += wc;
+                                continue;
+                        } else if (!wc)
+                                break;
+                        else if (wc == -ENAMETOOLONG && ns != *outs) {
+                                /* Only a buffer we allocated may be grown. */
+                                unsigned char *tc;
+                                /* Grow in multiples of 64 bytes. */
+                                tc = (unsigned char*)kmalloc((ns_len + 64) &
+                                                ~63, GFP_NOFS);
+                                if (tc) {
+                                        memcpy(tc, ns, ns_len);
+                                        /* Again keep a byte for the NULL. */
+                                        ns_len = ((ns_len + 64) & ~63) - 1;
+                                        kfree(ns);
+                                        ns = tc;
+                                        goto retry;
+                                } /* No memory so goto conversion_error; */
+                        } /* wc < 0, real error. */
+                        goto conversion_err;
+                }
+                ns[o] = 0;
+                *outs = ns;
+                return o;
+        } /* else (!ins) */
+        ntfs_error(vol->sb, "Received NULL pointer.");
+        return -EINVAL;
+conversion_err:
+        ntfs_error(vol->sb, "Unicode name contains characters that cannot be "
+                        "converted to character set %s.", nls->charset);
+        /* Free the buffer only if it was allocated here. */
+        if (ns != *outs)
+                kfree(ns);
+        if (wc != -ENAMETOOLONG)
+                wc = -EILSEQ;
+        return wc;
+mem_err_out:
+        ntfs_error(vol->sb, "Failed to allocate name!");
+        return -ENOMEM;
+}
diff --git a/fs/ntfs/upcase.c b/fs/ntfs/upcase.c
new file mode 100644
index 000000000000..879cdf1d5bd3
--- /dev/null
+++ b/fs/ntfs/upcase.c
@@ -0,0 +1,90 @@
+/*
+ * upcase.c - Generate the full NTFS Unicode upcase table in little endian.
+ *            Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001 Richard Russon <ntfs@flatcap.org>
+ * Copyright (c) 2001-2004 Anton Altaparmakov
+ *
+ * Modified for mkntfs inclusion 9 June 2001 by Anton Altaparmakov.
+ * Modified for kernel inclusion 10 September 2001 by Anton Altaparmakov.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS source
+ * in the file COPYING); if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#include "malloc.h"
+#include "ntfs.h"
+/**
+ * generate_default_upcase - build the default NTFS Unicode upcase table
+ *
+ * Allocate a table of default_upcase_len little endian Unicode characters
+ * with ntfs_malloc_nofs() and fill it in four steps:
+ *
+ *  1. every entry is set to its own index (identity mapping),
+ *  2. for each {Start, End, Add} run, Add is added to the entries in the
+ *     range Start <= i < End,
+ *  3. for each {Start, End} pair, every second entry (i + 1 for i = Start,
+ *     Start + 2, ... while i < End) is decremented by one,
+ *  4. for each {Offset, Value} pair, the entry at Offset is set to Value.
+ *
+ * Each of the three tables ends with an all zero entry.
+ *
+ * Return the table, or NULL if the allocation failed.
+ * NOTE(review): the caller presumably has to release the table with the free
+ * helper matching ntfs_malloc_nofs() - confirm against malloc.h.
+ */
+ntfschar *generate_default_upcase(void)
+{
+        static const int uc_run_table[][3] = { /* Start, End, Add */
+        {0x0061, 0x007B,  -32}, {0x0451, 0x045D, -80}, {0x1F70, 0x1F72,  74},
+        {0x00E0, 0x00F7,  -32}, {0x045E, 0x0460, -80}, {0x1F72, 0x1F76,  86},
+        {0x00F8, 0x00FF,  -32}, {0x0561, 0x0587, -48}, {0x1F76, 0x1F78, 100},
+        {0x0256, 0x0258, -205}, {0x1F00, 0x1F08,   8}, {0x1F78, 0x1F7A, 128},
+        {0x028A, 0x028C, -217}, {0x1F10, 0x1F16,   8}, {0x1F7A, 0x1F7C, 112},
+        {0x03AC, 0x03AD,  -38}, {0x1F20, 0x1F28,   8}, {0x1F7C, 0x1F7E, 126},
+        {0x03AD, 0x03B0,  -37}, {0x1F30, 0x1F38,   8}, {0x1FB0, 0x1FB2,   8},
+        {0x03B1, 0x03C2,  -32}, {0x1F40, 0x1F46,   8}, {0x1FD0, 0x1FD2,   8},
+        {0x03C2, 0x03C3,  -31}, {0x1F51, 0x1F52,   8}, {0x1FE0, 0x1FE2,   8},
+        {0x03C3, 0x03CC,  -32}, {0x1F53, 0x1F54,   8}, {0x1FE5, 0x1FE6,   7},
+        {0x03CC, 0x03CD,  -64}, {0x1F55, 0x1F56,   8}, {0x2170, 0x2180, -16},
+        {0x03CD, 0x03CF,  -63}, {0x1F57, 0x1F58,   8}, {0x24D0, 0x24EA, -26},
+        {0x0430, 0x0450,  -32}, {0x1F60, 0x1F68,   8}, {0xFF41, 0xFF5B, -32},
+        {0}
+        };
+        static const int uc_dup_table[][2] = { /* Start, End */
+        {0x0100, 0x012F}, {0x01A0, 0x01A6}, {0x03E2, 0x03EF}, {0x04CB, 0x04CC},
+        {0x0132, 0x0137}, {0x01B3, 0x01B7}, {0x0460, 0x0481}, {0x04D0, 0x04EB},
+        {0x0139, 0x0149}, {0x01CD, 0x01DD}, {0x0490, 0x04BF}, {0x04EE, 0x04F5},
+        {0x014A, 0x0178}, {0x01DE, 0x01EF}, {0x04BF, 0x04BF}, {0x04F8, 0x04F9},
+        {0x0179, 0x017E}, {0x01F4, 0x01F5}, {0x04C1, 0x04C4}, {0x1E00, 0x1E95},
+        {0x018B, 0x018B}, {0x01FA, 0x0218}, {0x04C7, 0x04C8}, {0x1EA0, 0x1EF9},
+        {0}
+        };
+        static const int uc_word_table[][2] = { /* Offset, Value */
+        {0x00FF, 0x0178}, {0x01AD, 0x01AC}, {0x01F3, 0x01F1}, {0x0269, 0x0196},
+        {0x0183, 0x0182}, {0x01B0, 0x01AF}, {0x0253, 0x0181}, {0x026F, 0x019C},
+        {0x0185, 0x0184}, {0x01B9, 0x01B8}, {0x0254, 0x0186}, {0x0272, 0x019D},
+        {0x0188, 0x0187}, {0x01BD, 0x01BC}, {0x0259, 0x018F}, {0x0275, 0x019F},
+        {0x018C, 0x018B}, {0x01C6, 0x01C4}, {0x025B, 0x0190}, {0x0283, 0x01A9},
+        {0x0192, 0x0191}, {0x01C9, 0x01C7}, {0x0260, 0x0193}, {0x0288, 0x01AE},
+        {0x0199, 0x0198}, {0x01CC, 0x01CA}, {0x0263, 0x0194}, {0x0292, 0x01B7},
+        {0x01A8, 0x01A7}, {0x01DD, 0x018E}, {0x0268, 0x0197},
+        {0}
+        };
+        int i, r;
+        ntfschar *uc;
+        uc = ntfs_malloc_nofs(default_upcase_len * sizeof(ntfschar));
+        /* Allocation failed: hand the NULL pointer back to the caller. */
+        if (!uc)
+                return uc;
+        /* Every entry is overwritten by the identity loop that follows. */
+        memset(uc, 0, default_upcase_len * sizeof(ntfschar));
+        /* Step 1: identity mapping, stored little endian. */
+        for (i = 0; i < default_upcase_len; i++)
+                uc[i] = cpu_to_le16(i);
+        /* Step 2: shift whole runs [Start, End) by a fixed amount. */
+        for (r = 0; uc_run_table[r][0]; r++)
+                for (i = uc_run_table[r][0]; i < uc_run_table[r][1]; i++)
+                        uc[i] = cpu_to_le16((le16_to_cpu(uc[i]) +
+                                        uc_run_table[r][2]));
+        /* Step 3: map every second character to the one just before it. */
+        for (r = 0; uc_dup_table[r][0]; r++)
+                for (i = uc_dup_table[r][0]; i < uc_dup_table[r][1]; i += 2)
+                        uc[i + 1] = cpu_to_le16(le16_to_cpu(uc[i + 1]) - 1);
+        /* Step 4: individual entries that fit none of the patterns above. */
+        for (r = 0; uc_word_table[r][0]; r++)
+                uc[uc_word_table[r][0]] = cpu_to_le16(uc_word_table[r][1]);
+        return uc;
+}
diff --git a/fs/ntfs/volume.h b/fs/ntfs/volume.h
new file mode 100644
index 000000000000..4b97fa8635a8
--- /dev/null
+++ b/fs/ntfs/volume.h
@@ -0,0 +1,171 @@
+/*
+ * volume.h - Defines for volume structures in NTFS Linux kernel driver. Part
+ *            of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2004 Anton Altaparmakov
+ * Copyright (c) 2002 Richard Russon
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#ifndef _LINUX_NTFS_VOLUME_H
+#define _LINUX_NTFS_VOLUME_H
+#include <linux/rwsem.h>
+#include "types.h"
+#include "layout.h"
+/*
+ * The NTFS in-memory super block structure.
+ */
+typedef struct {
+        /*
+         * FIXME: Reorder to have commonly used together elements within the
+         * same cache line, aiming at a cache line size of 32 bytes. Aim for
+         * 64 bytes for less commonly used together elements. Put most commonly
+         * used elements to front of structure. Obviously do this only when the
+         * structure has stabilized... (AIA)
+         */
+        /* Device specifics. */
+        struct super_block *sb;         /* Pointer back to the super_block,
+                                           so we don't have to get the offset
+                                           every time. */
+        LCN nr_blocks;                  /* Number of NTFS_BLOCK_SIZE byte
+                                           sized blocks on the device. */
+        /* Configuration provided by user at mount time. */
+        unsigned long flags;            /* Miscellaneous flags, see below. */
+        uid_t uid;                      /* uid that files will be mounted as. */
+        gid_t gid;                      /* gid that files will be mounted as. */
+        mode_t fmask;                   /* The mask for file permissions. */
+        mode_t dmask;                   /* The mask for directory
+                                           permissions. */
+        u8 mft_zone_multiplier;         /* Initial mft zone multiplier. */
+        u8 on_errors;                   /* What to do on file system errors. */
+        /* NTFS bootsector provided information. */
+        u16 sector_size;                /* in bytes */
+        u8 sector_size_bits;            /* log2(sector_size) */
+        u32 cluster_size;               /* in bytes */
+        u32 cluster_size_mask;          /* cluster_size - 1 */
+        u8 cluster_size_bits;           /* log2(cluster_size) */
+        u32 mft_record_size;            /* in bytes */
+        u32 mft_record_size_mask;       /* mft_record_size - 1 */
+        u8 mft_record_size_bits;        /* log2(mft_record_size) */
+        u32 index_record_size;          /* in bytes */
+        u32 index_record_size_mask;     /* index_record_size - 1 */
+        u8 index_record_size_bits;      /* log2(index_record_size) */
+        LCN nr_clusters;                /* Volume size in clusters == number of
+                                           bits in lcn bitmap. */
+        LCN mft_lcn;                    /* Cluster location of mft data. */
+        LCN mftmirr_lcn;                /* Cluster location of copy of mft. */
+        u64 serial_no;                  /* The volume serial number. */
+        /* Mount specific NTFS information. */
+        u32 upcase_len;                 /* Number of entries in upcase[]. */
+        ntfschar *upcase;               /* The upcase table. */
+        s32 attrdef_size;               /* Size of the attribute definition
+                                           table in bytes. */
+        ATTR_DEF *attrdef;              /* Table of attribute definitions.
+                                           Obtained from FILE_AttrDef. */
+#ifdef NTFS_RW
+        /* Variables used by the cluster and mft allocators. */
+        s64 mft_data_pos;               /* Mft record number at which to
+                                           allocate the next mft record. */
+        LCN mft_zone_start;             /* First cluster of the mft zone. */
+        LCN mft_zone_end;               /* First cluster beyond the mft zone. */
+        LCN mft_zone_pos;               /* Current position in the mft zone. */
+        LCN data1_zone_pos;             /* Current position in the first data
+                                           zone. */
+        LCN data2_zone_pos;             /* Current position in the second data
+                                           zone. */
+#endif /* NTFS_RW */
+        struct inode *mft_ino;          /* The VFS inode of $MFT. */
+        struct inode *mftbmp_ino;       /* Attribute inode for $MFT/$BITMAP. */
+        struct rw_semaphore mftbmp_lock; /* Lock for serializing accesses to the
+                                            mft record bitmap ($MFT/$BITMAP). */
+#ifdef NTFS_RW
+        struct inode *mftmirr_ino;      /* The VFS inode of $MFTMirr. */
+        int mftmirr_size;               /* Size of mft mirror in mft records. */
+        struct inode *logfile_ino;      /* The VFS inode of $LogFile. */
+#endif /* NTFS_RW */
+        struct inode *lcnbmp_ino;       /* The VFS inode of $Bitmap. */
+        struct rw_semaphore lcnbmp_lock; /* Lock for serializing accesses to the
+                                            cluster bitmap ($Bitmap/$DATA). */
+        struct inode *vol_ino;          /* The VFS inode of $Volume. */
+        VOLUME_FLAGS vol_flags;         /* Volume flags. */
+        u8 major_ver;                   /* Ntfs major version of volume. */
+        u8 minor_ver;                   /* Ntfs minor version of volume. */
+        struct inode *root_ino;         /* The VFS inode of the root
+                                           directory. */
+        struct inode *secure_ino;       /* The VFS inode of $Secure (NTFS3.0+
+                                           only, otherwise NULL). */
+        struct inode *extend_ino;       /* The VFS inode of $Extend (NTFS3.0+
+                                           only, otherwise NULL). */
+#ifdef NTFS_RW
+        /* $Quota stuff is NTFS3.0+ specific.  Unused/NULL otherwise. */
+        struct inode *quota_ino;        /* The VFS inode of $Quota. */
+        struct inode *quota_q_ino;      /* Attribute inode for $Quota/$Q. */
+#endif /* NTFS_RW */
+        struct nls_table *nls_map;      /* NLS table (presumably for names). */
+} ntfs_volume;
+/*
+ * Bit numbers (not masks) for the flags field in the ntfs_volume structure.
+ */
+typedef enum {
+        NV_Errors,              /* 1: Volume has errors, prevent remount rw. */
+        NV_ShowSystemFiles,     /* 1: Return system files in ntfs_readdir(). */
+        NV_CaseSensitive,       /* 1: Treat file names as case sensitive and
+                                      create filenames in the POSIX namespace.
+                                      Otherwise be case insensitive and create
+                                      file names in WIN32 namespace. */
+        NV_LogFileEmpty,        /* 1: $LogFile journal is empty. */
+        NV_QuotaOutOfDate,      /* 1: $Quota is out of date. */
+} ntfs_volume_flags;
+/*
+ * NVOL_FNS(Foo) emits static inline helpers: NVolFoo() tests, NVolSetFoo()
+ * sets and NVolClearFoo() clears bit NV_Foo in the flags field of the volume.
+ */
+#define NVOL_FNS(flag)                                  \
+static inline int NVol##flag(ntfs_volume *vol)          \
+{                                                       \
+        return test_bit(NV_##flag, &(vol)->flags);      \
+}                                                       \
+static inline void NVolSet##flag(ntfs_volume *vol)      \
+{                                                       \
+        set_bit(NV_##flag, &(vol)->flags);              \
+}                                                       \
+static inline void NVolClear##flag(ntfs_volume *vol)    \
+{                                                       \
+        clear_bit(NV_##flag, &(vol)->flags);            \
+}
+/* Emit the NVol*() helpers, one set per ntfs_volume_flags enumerator. */
+NVOL_FNS(Errors)
+NVOL_FNS(ShowSystemFiles)
+NVOL_FNS(CaseSensitive)
+NVOL_FNS(LogFileEmpty)
+NVOL_FNS(QuotaOutOfDate)
+#endif /* _LINUX_NTFS_VOLUME_H */